In [15]:
import pandas as pd
import numpy as np
import struct
from tqdm import tqdm

class MemoryAccess:
    def __init__(self, address, value_hex):
        self.address = int(address, 16)
        self.value_hex = value_hex
        # Convert hex string to int, then to bytes, then to double
        self.value_double = struct.unpack('d', struct.pack('Q', int(value_hex, 16)))[0]

class AllocationSite:
    def __init__(self, start_address, size):
        self.start_address = int(start_address, 16)
        self.size = int(size)
        self.end_address = self.start_address + self.size
        self.stack_trace = []
        self.stores = []
    
    def contains_address(self, address):
        return self.start_address <= address < self.end_address

def parse_memlog(filename):
    memory_stores = []
    allocation_sites = []
    current_allocation = None
    
    # Count lines for progress bar
    total_lines = sum(1 for _ in open(filename))
    
    with open(filename) as f:
        with tqdm(total=total_lines, desc="Processing logs") as pbar:
            # Step 1: Skip until "Parent PID"
            for line in f:
                pbar.update(1)
                if "Parent PID" in line:
                    next(f)  # Skip the Parent PID line
                    break
            
            # Step 2: Parse memory stores
            for line in f:
                pbar.update(1)
                line = line.strip()
                if line == "=== Allocation sites ===":
                    break
                    
                # Parse memory access line
                try:
                    addr, value = line.split()
                    memory_stores.append(MemoryAccess(addr, value))
                except ValueError:
                    continue  # Skip lines that don't match the format
            
            # Step 3: Parse allocation sites
            for line in f:
                pbar.update(1)
                pbar.update(1)
                line = line.strip()
                
                # Stop at heap summary
                if "HEAP SUMMARY" in line:
                    break
                    
                # New allocation site
                if line.startswith("Start"):
                    parts = line.split(",")
                    addr = parts[0].split()[1]
                    size = parts[1].split()[1]
                    current_allocation = AllocationSite(addr, size)
                    allocation_sites.append(current_allocation)
                
                # Stack trace line
                elif line.startswith("==") and "by" in line:
                    if current_allocation is not None:
                        # Remove the valgrind prefix and add to stack trace
                        trace = line.split("by", 1)[1].strip()
                        current_allocation.stack_trace.append(trace)
    
    print("\nGrouping memory stores by allocation site...")
    # Step 4: Group memory stores by allocation site
    for access in tqdm(memory_stores, desc="Classifying stores"):
        for site in allocation_sites:
            if site.contains_address(access.address):
                site.stores.append(access)
                break
    
    # Step 5: Calculate statistics
    results = []
    for site in allocation_sites:
        if not site.stores:  # Skip sites with no stores
            continue
            
        values = [store.value_double for store in site.stores]
        
        result = {
            'allocation_start': f"0x{site.start_address:x}",
            'size': site.size,
            'total_stores': len(site.stores),
            'zero_values': sum(1 for v in values if v == 0),
            'min_value': min(values),
            'max_value': max(values),
            'stack_trace': '\n'.join(site.stack_trace)
        }
        results.append(result)
    
    return pd.DataFrame(results)

import bz2
import os

input_file = 'memlog.log.bz2'
output_file = 'memlog.log'

# Determine the total size of the compressed file for progress tracking
file_size = os.path.getsize(input_file)

# Open the compressed file in read mode and the output file in write mode
with bz2.BZ2File(input_file, 'rb') as bz2_file, open(output_file, 'wb') as out_file, tqdm(
    total=file_size, unit='B', unit_scale=True, desc='Decompressing logs') as pbar:
    # Read and write in chunks
    while chunk := bz2_file.read(1024):  # Adjust chunk size if needed
        out_file.write(chunk)
        pbar.update(len(chunk))

# Run the analysis
df = parse_memlog('memlog.log')

# Display results
print("\nMemory Stores Analysis Results:")
print("===============================")
for idx, row in df.iterrows():
    print(f"\nAllocation Site at {row['allocation_start']} (size: {row['size']} bytes)")
    print(f"Stack trace:\n{row['stack_trace']}")
    print(f"\nStatistics:")
    print(f"  Total stores: {row['total_stores']}")
    print(f"  Zero values: {row['zero_values']} ({row['zero_values']/row['total_stores']*100:.1f}%)")
    print(f"  Min value: {row['min_value']}")
    print(f"  Max value: {row['max_value']}")

Decompressing logs: 1.04GB [00:10, 103MB/s]                            
Processing logs: 36749404it [00:44, 822854.32it/s]                               



Grouping memory stores by allocation site...


Classifying stores: 100%|██████████| 36749312/36749312 [00:08<00:00, 4511603.46it/s]



Memory Stores Analysis Results:

Allocation Site at 0x11c18040 (size: 214400000 bytes)
Stack trace:
0x109431: LBM_allocateGrid (lbm.c:26)
0x10C00F: MAIN_initialize (main.c:126)
0x109262: main (main.c:34)

Statistics:
  Total stores: 30549288
  Zero values: 1340000 (4.4%)
  Min value: 0.0
  Max value: 0.3333333333333334

Allocation Site at 0x4fa0040 (size: 214400000 bytes)
Stack trace:
0x109431: LBM_allocateGrid (lbm.c:26)
0x10C003: MAIN_initialize (main.c:125)
0x109262: main (main.c:34)

Statistics:
  Total stores: 6200024
  Zero values: 1340000 (21.6%)
  Min value: 0.0
  Max value: 0.3333333333333333
