In [4]:
import csv
from tqdm import tqdm

def process_memlog_in_chunks(file_path, stores_path='stores.csv',
                             allocation_sites_path='allocation_sites.csv'):
    """
    Split a memory log into a stores CSV and an allocation-sites CSV.

    The log is read line by line and interpreted as follows:
      * Everything up to and including the first line containing
        'Parent PID' is ignored.
      * After that, blank lines and single-token lines are skipped and each
        two-token line is written to the stores CSV as (address, value).
      * A line equal to '=== Allocation sites ===' switches to the
        allocation section. There, a line starting with 'Start' opens a new
        site (start and size are the second whitespace token of the first
        and second comma-separated parts); every other non-blank line is
        appended to the current site's stack trace.

    Parameters:
        file_path (str): Path to the memory log.
        stores_path (str): Output path for the stores CSV
            (columns: address, value).
        allocation_sites_path (str): Output path for the allocation sites
            CSV (columns: start, size, stack_trace; the stack trace lines
            are joined with single spaces).

    Raises:
        IndexError: If a 'Start' line does not have the expected
            '<label> <start>, <label> <size>' shape.
    """
    store_started = False
    allocation_started = False
    current_site = None  # [start, size, [stack trace lines]] being collected
    skipped_store_lines = 0

    # The input is opened first so a missing log does not truncate existing
    # outputs; the context managers guarantee every file is flushed and
    # closed even if parsing fails part-way through.
    with open(file_path, 'r') as log_file, \
            open(stores_path, 'w', newline='') as stores_file, \
            open(allocation_sites_path, 'w', newline='') as allocation_sites_file:
        stores_writer = csv.writer(stores_file)
        allocation_sites_writer = csv.writer(allocation_sites_file)

        # Write header for the CSV files
        stores_writer.writerow(['address', 'value'])
        allocation_sites_writer.writerow(['start', 'size', 'stack_trace'])

        # Count lines up front so tqdm can show a meaningful total, then rewind.
        total_lines = sum(1 for _ in log_file)
        log_file.seek(0)

        for line in tqdm(log_file, total=total_lines, desc="Processing lines", unit="line"):
            if 'Parent PID' in line:
                store_started = True
                continue
            if not store_started:
                continue  # Still in the preamble before 'Parent PID'

            stripped = line.strip()
            if not stripped:
                continue

            if not allocation_started:
                if stripped == '=== Allocation sites ===':
                    allocation_started = True
                    continue

                fields = stripped.split()
                if len(fields) == 1:
                    # Single-token lines (e.g. the one right after
                    # 'Parent PID') carry no store.
                    continue
                if len(fields) == 2:
                    stores_writer.writerow(fields)
                else:
                    # Neither a store nor the section marker: skip it rather
                    # than abort a long run, but keep count for the summary.
                    skipped_store_lines += 1
                continue

            # Allocation section.
            if line.startswith('Start'):
                parts = line.split(',')
                start, size = parts[0].split()[1], parts[1].split()[1]
                # Flush the previous site before starting a new one.
                if current_site is not None:
                    allocation_sites_writer.writerow(
                        [current_site[0], current_site[1], ' '.join(current_site[2])])
                current_site = [start, size, []]
            elif current_site is not None:
                current_site[2].append(stripped)
            # A trace line before any 'Start' line has no site to attach to
            # and is ignored.

        # After finishing the file, write any remaining allocation site
        if current_site is not None:
            allocation_sites_writer.writerow(
                [current_site[0], current_site[1], ' '.join(current_site[2])])

    if skipped_store_lines:
        print(f"Skipped {skipped_store_lines} malformed store line(s).")

# Example usage: split 'memlog.log' into stores.csv and allocation_sites.csv
# (both are written relative to the current working directory).
process_memlog_in_chunks('memlog.log')

Processing lines: 100%|██████████| 1137625979/1137625979 [1:38:20<00:00, 192817.23line/s] 


In [1]:
import bisect
import csv
import math
import struct

from tqdm import tqdm

def hex_to_double(hex_str):
    """
    Reinterpret a hexadecimal bit pattern as an IEEE-754 double.

    The input is lower-cased, any '0x' is removed, and the result is
    left-padded with zeros to 16 hex digits before being decoded as a
    big-endian 8-byte double.

    Parameters:
        hex_str (str): Hexadecimal string (e.g., '0x3f9c71c71c71c71c').

    Returns:
        float or None: The decoded double, or None (after printing a
        message) when the digits are not valid hex or do not form exactly
        8 bytes.
    """
    # Normalisation is pure string work and cannot raise the errors handled
    # below, so it lives outside the try block.
    hex_str = hex_str.lower().replace('0x', '').zfill(16)
    try:
        (double_val,) = struct.unpack('>d', bytes.fromhex(hex_str))
    except (ValueError, struct.error) as e:
        print(f"Error converting hex to double for '{hex_str}': {e}")
        return None
    return double_val

def hex_to_int(hex_str):
    """
    Parse a base-16 string into an integer.

    Parameters:
        hex_str (str): Hexadecimal string (e.g., '0x4f990d8').

    Returns:
        int or None: The parsed value, or None (after printing a message)
        when the string is not valid hexadecimal.
    """
    try:
        parsed = int(hex_str, 16)
    except ValueError:
        print(f"Invalid hexadecimal address: {hex_str}")
        parsed = None
    return parsed

def load_allocation_sites(allocation_sites_csv):
    """
    Read the allocation-sites CSV and return its rows ordered by start address.

    Every returned site is a dictionary with these keys:
        - start: Start address (int, parsed from hex)
        - end: start + size (int)
        - size: Size (int, parsed as decimal)
        - stack_trace: Stack trace (str)
        - total_stores: Store counter, initialised to 0
        - zero_values: Zero-store counter, initialised to 0
        - max_value: Initialised to None
        - min_value: Initialised to None

    Rows whose start address or size cannot be parsed are skipped after a
    message is printed.

    Parameters:
        allocation_sites_csv (str): Path to allocation_sites.csv

    Returns:
        list: Allocation site dictionaries sorted by 'start'.
    """
    sites = []
    with open(allocation_sites_csv, 'r', newline='', encoding='utf-8') as csv_file:
        for record in csv.DictReader(csv_file):
            start_hex = record['start'].strip()
            size_str = record['size'].strip()
            trace_text = record['stack_trace'].strip()

            start_addr = hex_to_int(start_hex)
            if start_addr is None:
                # hex_to_int has already reported the bad address.
                continue

            try:
                byte_count = int(size_str)
            except ValueError:
                print(f"Invalid size '{size_str}' for start address {start_hex}. Skipping.")
                continue

            sites.append(dict(
                start=start_addr,
                end=start_addr + byte_count,
                size=byte_count,
                stack_trace=trace_text,
                total_stores=0,
                zero_values=0,
                max_value=None,
                min_value=None,
            ))

    # A stable sort keeps file order among sites sharing a start address.
    return sorted(sites, key=lambda site: site['start'])

def find_allocation_site(allocation_sites, address):
    """
    Find the allocation site that contains the given address using binary search.

    The search runs directly over `allocation_sites` by index, so a lookup
    costs O(log n) and builds no temporary list. This matters because the
    function is meant to be called once per store.

    Parameters:
        allocation_sites (list): Allocation site dictionaries sorted by
            'start'; each must provide 'start' and 'end'.
        address (int): The address to find.

    Returns:
        dict or None: The site with the greatest 'start' <= address (the
        last one in list order when several share that start) if
        address < its 'end'; otherwise None.
    """
    # Locate the insertion point to the right of every site whose start is
    # <= address (the same position bisect.bisect_right would return for a
    # list of start addresses).
    lo, hi = 0, len(allocation_sites)
    while lo < hi:
        mid = (lo + hi) // 2
        if address < allocation_sites[mid]['start']:
            hi = mid
        else:
            lo = mid + 1

    if lo == 0:
        return None  # address lies below every site (or the list is empty)

    candidate = allocation_sites[lo - 1]
    if address < candidate['end']:
        return candidate
    return None

def analyze_allocation_sites_and_stores(stores_csv, allocation_sites_csv, output_csv):
    """
    Attribute every store to its allocation site and write per-site statistics.

    For each data row of `stores_csv` (address, value as hex strings) the
    site whose [start, end) range contains the address is looked up, and
    that site's counters are updated. Stores that match no site are ignored.
    The value is decoded as a double; NaN values are counted in
    total_stores but excluded from max_value/min_value, because a NaN taken
    as the first value would otherwise never be replaced (every comparison
    against NaN is False).

    Parameters:
        stores_csv (str): Path to stores.csv (comma-separated, with header)
        allocation_sites_csv (str): Path to allocation_sites.csv
        output_csv (str): Path to output CSV file. Columns: start (hex),
            size, total_stores, zero_values, max_value, min_value,
            stack_trace. max_value/min_value are empty for sites that
            received no non-NaN store.

    Returns:
        None. Returns early, without writing `output_csv`, when no valid
        allocation site is loaded or `stores_csv` has no header line.
    """
    print("Loading allocation sites...")
    allocation_sites = load_allocation_sites(allocation_sites_csv)
    if not allocation_sites:
        print("No valid allocation sites found. Exiting.")
        return

    print(f"Total allocation sites loaded: {len(allocation_sites)}")

    # Binary search needs the sites ordered by start address. The key list
    # is built once here rather than once per store.
    allocation_sites.sort(key=lambda x: x['start'])
    starts = [site['start'] for site in allocation_sites]

    # Count total lines in stores.csv for tqdm
    print("Counting total lines in stores.csv for progress tracking...")
    with open(stores_csv, 'r', newline='', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    print("Processing stores.csv...")
    with open(stores_csv, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        if next(reader, None) is None:  # Consume the header row
            print("stores.csv is empty. Exiting.")
            return

        # Initialize tqdm with total lines minus header
        for line in tqdm(reader, total=total_lines - 1, desc="Processing stores", unit="store"):
            if not line:
                continue  # Skip empty lines

            if len(line) < 2:
                print(f"Malformed line (expected 2 columns): {line}")
                continue
            address_hex, value_hex = line[0].strip(), line[1].strip()

            # Convert address to integer
            address = hex_to_int(address_hex)
            if address is None:
                continue  # Skip invalid addresses

            # Convert value to double
            value = hex_to_double(value_hex)
            if value is None:
                continue  # Skip invalid value conversions

            # Rightmost site whose start is <= address, if any.
            index = bisect.bisect_right(starts, address) - 1
            if index < 0:
                continue  # Address lies below every allocation site
            site = allocation_sites[index]
            if address >= site['end']:
                continue  # Address falls in a gap between sites

            # Update statistics
            site['total_stores'] += 1
            if value == 0.0:
                site['zero_values'] += 1
            if math.isnan(value):
                continue  # NaN has no ordering; keep it out of min/max
            if (site['max_value'] is None) or (value > site['max_value']):
                site['max_value'] = value
            if (site['min_value'] is None) or (value < site['min_value']):
                site['min_value'] = value

    print("Writing analysis results to CSV...")
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['start', 'size', 'total_stores', 'zero_values', 'max_value', 'min_value', 'stack_trace']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for site in allocation_sites:
            writer.writerow({
                'start': hex(site['start']),
                'size': site['size'],
                'total_stores': site['total_stores'],
                'zero_values': site['zero_values'],
                'max_value': site['max_value'] if site['max_value'] is not None else '',
                'min_value': site['min_value'] if site['min_value'] is not None else '',
                'stack_trace': site['stack_trace']
            })

    print(f"Analysis complete. Results saved to {output_csv}.")

# Run the analysis on stores.csv and allocation_sites.csv and write the
# per-site statistics to allocation_analysis.csv.
analyze_allocation_sites_and_stores(
    stores_csv='stores.csv',
    allocation_sites_csv='allocation_sites.csv',
    output_csv='allocation_analysis.csv'
)

Loading allocation sites...
Total allocation sites loaded: 2210
Counting total lines in stores.csv for progress tracking...
Processing stores.csv...


Processing stores: 100%|██████████| 1137609478/1137609478 [21:04:44<00:00, 14991.35store/s]   

Writing analysis results to CSV...
Analysis complete. Results saved to allocation_analysis.csv.



