In [130]:
# Cell 1: Imports and Setup
import json
import re
from pathlib import Path
from typing import List, Dict

print("✓ Imports loaded")

✓ Imports loaded


In [131]:
# Cell 2: Updated Violation Finder with Better Patterns
def find_lowercase_suffix_violations(code: str) -> List[Dict]:
    """
    Find literals with lowercase suffixes (u, l, f, ul, ll, etc.)
    
    UPDATED: More comprehensive pattern matching
    """
    violations = []
    
    # More comprehensive patterns
    patterns = [
        # Hexadecimal with lowercase suffixes
        (r'\b0x[0-9a-fA-F]+u\b', 'hex_u'),
        (r'\b0x[0-9a-fA-F]+ul\b', 'hex_ul'),
        (r'\b0x[0-9a-fA-F]+ull\b', 'hex_ull'),
        (r'\b0x[0-9a-fA-F]+l\b', 'hex_l'),
        (r'\b0x[0-9a-fA-F]+ll\b', 'hex_ll'),
        (r'\b0x[0-9a-fA-F]+lu\b', 'hex_lu'),          # Alternative order
        (r'\b0x[0-9a-fA-F]+llu\b', 'hex_llu'),        # Alternative order
        
        # Decimal with lowercase suffixes
        (r'\b\d+u\b', 'dec_u'),
        (r'\b\d+ul\b', 'dec_ul'),
        (r'\b\d+ull\b', 'dec_ull'),
        (r'\b\d+l\b', 'dec_l'),
        (r'\b\d+ll\b', 'dec_ll'),
        (r'\b\d+lu\b', 'dec_lu'),
        (r'\b\d+llu\b', 'dec_llu'),
        
        # Floats with lowercase suffixes
        (r'\b\d+\.\d+f\b', 'float_f'),
        (r'\b\d+\.\d*f\b', 'float_f_alt'),             # e.g., 1.f
        (r'\b\d*\.\d+f\b', 'float_f_alt2'),            # e.g., .5f
        (r'\b\d+f\b', 'int_f'),
        (r'\b\d+\.\d+l\b', 'float_l'),
        (r'\b\d+\.?\d*e[+-]?\d+f\b', 'scientific_f'),  # Scientific notation
        
        # Octal with lowercase suffixes (starts with 0)
        (r'\b0[0-7]+u\b', 'oct_u'),
        (r'\b0[0-7]+ul\b', 'oct_ul'),
        (r'\b0[0-7]+l\b', 'oct_l'),
    ]
    
    for pattern, viol_type in patterns:
        for match in re.finditer(pattern, code, re.IGNORECASE):
            literal = match.group(0)
            
            # Check if it actually has lowercase suffix
            # (pattern is case-insensitive, so we need to verify)
            if literal == literal.upper():
                continue  # Skip if already uppercase
            
            # Check if suffix is lowercase
            has_lowercase_suffix = False
            for suffix_char in ['u', 'l', 'f']:
                if suffix_char in literal.lower() and suffix_char in literal:
                    has_lowercase_suffix = True
                    break
            
            if not has_lowercase_suffix:
                continue
            
            line_num = code[:match.start()].count('\n') + 1
            
            violations.append({
                'literal': literal,
                'fixed': literal.upper(),
                'type': viol_type,
                'line': line_num
            })
    
    return violations

print("✓ Updated violation finder function defined")

✓ Updated violation finder function defined


In [132]:
# Cell 3: Test the Finder (Optional - for verification)
test_code = """
void example() {
    unsigned int a = 0x1234u;     // VIOLATION
    long b = 100l;                 // VIOLATION  
    unsigned long c = 0xFFul;      // VIOLATION
    float d = 1.5f;                // VIOLATION
    
    unsigned int good = 0x1234U;   // OK
    long good2 = 100L;             // OK
}
"""

violations = find_lowercase_suffix_violations(test_code)

print("Test Results:")
print(f"Found {len(violations)} violations:\n")
for v in violations:
    print(f"  Line {v['line']}: {v['literal']} → {v['fixed']} ({v['type']})")

Test Results:
Found 7 violations:

  Line 3: 0x1234u → 0X1234U (hex_u)
  Line 5: 0xFFul → 0XFFUL (hex_ul)
  Line 4: 100l → 100L (dec_l)
  Line 6: 1.5f → 1.5F (float_f)
  Line 6: 1.5f → 1.5F (float_f_alt)
  Line 6: 1.5f → 1.5F (float_f_alt2)
  Line 6: 5f → 5F (int_f)


In [133]:
# Cell 4: Universal Scanner - Handles both JSON formats
def scan_primevul_dataset(json_file: str) -> List[Dict]:
    """
    Scan PrimeVul dataset and extract entries with MISRA 2-13-4 violations
    Handles both JSON array format and newline-delimited JSON
    
    Args:
        json_file: Path to PrimeVul JSON file
        
    Returns:
        List of examples with violations
    """
    examples = []
    
    print(f"Scanning {json_file} for MISRA 2-13-4 violations...")
    print("Rule: Literal suffixes shall be upper case\n")
    
    # First, try to load as complete JSON array
    try:
        with open(json_file, 'r', encoding='utf-8', errors='ignore') as f:
            data = json.load(f)
            
        print(f"Loaded JSON array with {len(data)} entries")
        
        for idx, entry in enumerate(data):
            # Get the function code
            code = entry.get('func', '')
            if not code:
                continue
            
            # Find violations
            violations = find_lowercase_suffix_violations(code)
            
            if violations:
                examples.append({
                    'idx': entry.get('idx'),
                    'project': entry.get('project'),
                    'file_name': entry.get('file_name'),
                    'cve': entry.get('cve'),
                    "cve_desc": entry.get('cve_desc'),
                    'commit_url': entry.get('commit_url'),
                    'violations': violations,
                    'code': code
                })
                
                if len(examples) % 10 == 0:
                    print(f"  Found {len(examples)} examples so far...")
            
            if (idx + 1) % 500 == 0:
                print(f"  Processed {idx + 1} entries...")
    
    except json.JSONDecodeError:
        # If that fails, try newline-delimited JSON
        print("JSON array format failed, trying newline-delimited format...")
        
        with open(json_file, 'r', encoding='utf-8', errors='ignore') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                
                if not line or line in ['[', ']']:
                    continue
                
                # Remove trailing comma
                if line.endswith(','):
                    line = line[:-1]
                
                try:
                    entry = json.loads(line)
                    
                    code = entry.get('func', '')
                    if not code:
                        continue
                    
                    violations = find_lowercase_suffix_violations(code)
                    
                    if violations:
                        examples.append({
                            'idx': entry.get('idx'),
                            'project': entry.get('project'),
                            'file_name': entry.get('file_name'),
                            'cve': entry.get('cve'),
                            'commit_url': entry.get('commit_url'),
                            'violations': violations,
                            'code': code
                        })
                        
                        if len(examples) % 10 == 0:
                            print(f"  Found {len(examples)} examples so far...")
                    
                    if line_num % 500 == 0:
                        print(f"  Processed {line_num} lines...")
                            
                except json.JSONDecodeError:
                    continue
    
    print(f"\n✓ Scan complete!")
    print(f"✓ Found {len(examples)} examples with violations")
    
    return examples

print("✓ Scanner function defined")

✓ Scanner function defined


In [134]:
# Cell 5: Run the Scanner
# UPDATE THIS PATH to your PrimeVul dataset file
dataset_file = 'primevul_valid_paired.jsonl'  # Change this to your file path

examples = scan_primevul_dataset(dataset_file)

Scanning primevul_valid_paired.jsonl for MISRA 2-13-4 violations...
Rule: Literal suffixes shall be upper case

JSON array format failed, trying newline-delimited format...
  Found 10 examples so far...
  Processed 500 lines...
  Found 20 examples so far...

✓ Scan complete!
✓ Found 25 examples with violations


In [135]:
# Cell 6: Display Summary Statistics
print("="*70)
print("SUMMARY STATISTICS")
print("="*70)

# Count violation types
violation_types = {}
for ex in examples:
    for v in ex['violations']:
        vtype = v['type']
        violation_types[vtype] = violation_types.get(vtype, 0) + 1

print("\nViolation Types Distribution:")
for vtype, count in sorted(violation_types.items(), key=lambda x: x[1], reverse=True):
    print(f"  {vtype:15s}: {count:4d}")

print(f"\nTotal Examples: {len(examples)}")
print(f"Total Violations: {sum(len(ex['violations']) for ex in examples)}")
# print(f"Avg Violations per Example: {sum(len(ex['violations']) for ex in examples) / len(examples):.2f}")

SUMMARY STATISTICS

Violation Types Distribution:
  int_f          :  102
  float_f_alt    :   54
  float_f        :   50
  float_f_alt2   :   50
  dec_u          :    9
  hex_l          :    4
  oct_u          :    4
  hex_ull        :    2
  dec_ll         :    2

Total Examples: 25
Total Violations: 277


In [136]:
# Cell 7: View Sample Examples
print("="*70)
print("SAMPLE EXAMPLES (First 5)")
print("="*70)

for i, ex in enumerate(examples[:5], 1):
    print(f"\n{'─'*70}")
    print(f"Example #{i}")
    print(f"{'─'*70}")
    print(f"Project: {ex.get('project', 'Unknown')}")
    print(f"File: {ex.get('file_name', 'N/A')}")
    print(f"CVE: {ex.get('cve', 'N/A')}")
    
    if ex.get('commit_url'):
        print(f"URL: {ex['commit_url']}")
    
    print(f"\nViolations ({len(ex['violations'])}):")
    for v in ex['violations']:
        print(f"  Line {v['line']:4d}: {v['literal']:15s} → {v['fixed']:15s} [{v['type']}]")

SAMPLE EXAMPLES (First 5)

──────────────────────────────────────────────────────────────────────
Example #1
──────────────────────────────────────────────────────────────────────
Project: gpac
File: filedump.c
CVE: CVE-2021-32138
URL: https://github.com/gpac/gpac/commit/289ffce3e0d224d314f5f92a744d5fe35999f20b

Violations (4):
  Line 1015: 2f              → 2F              [int_f]
  Line 1015: 2f              → 2F              [int_f]
  Line 1017: 2f              → 2F              [int_f]
  Line 1017: 2f              → 2F              [int_f]

──────────────────────────────────────────────────────────────────────
Example #2
──────────────────────────────────────────────────────────────────────
Project: gpac
File: filedump.c
CVE: CVE-2021-32138
URL: https://github.com/gpac/gpac/commit/289ffce3e0d224d314f5f92a744d5fe35999f20b

Violations (4):
  Line 1019: 2f              → 2F              [int_f]
  Line 1019: 2f              → 2F              [int_f]
  Line 1021: 2f              → 2F   

In [137]:
# Cell 7.5: Extract IDX values into a list
if len(examples) == 0:
    print("No violations found - idx list is empty")
    idx_list = []
else:
    idx_list = [ex['idx'] for ex in examples]
    
    print(f"Extracted {len(idx_list)} idx values")
    print(idx_list[:])
    
    print(f"\nSample idx values with details:")
    for i in range(min(5, len(examples))):
        print(f"  idx: {idx_list[i]} - {examples[i].get('project')} ({len(examples[i]['violations'])} violations)")

# The idx_list variable now contains all idx values
print(f"\n✓ idx_list created with {len(idx_list)} values")

Extracted 25 idx values
[196719, 243213, 197575, 261456, 197652, 262684, 198110, 269121, 326145, 204281, 356273, 206645, 384338, 207309, 395532, 208514, 412024, 208675, 413449, 210612, 438548, 212695, 460737, 216804, 506600]

Sample idx values with details:
  idx: 196719 - gpac (4 violations)
  idx: 243213 - gpac (4 violations)
  idx: 197575 - tensorflow (4 violations)
  idx: 261456 - tensorflow (4 violations)
  idx: 197652 - FFmpeg (2 violations)

✓ idx_list created with 25 values


In [138]:
# Cell 8: View Code for a Specific Example (Updated)
if len(examples) == 0:
    print("No violations found in the dataset.")
    print("Cannot display examples - the list is empty.")
else:
    # Change the index to view different examples
    example_index = 0  # View first example
    
    if example_index >= len(examples):
        print(f"Error: Index {example_index} is out of range.")
        print(f"Valid range: 0 to {len(examples) - 1}")
    else:
        ex = examples[example_index]
        
        print(f"Example: {ex.get('project', 'Unknown')} - {ex.get('file_name', 'N/A')}")
        print(f"CVE: {ex.get('cve', 'N/A')}")
        print(f"\nViolations:")
        for v in ex['violations']:
            print(f"  Line {v['line']}: {v['literal']} → {v['fixed']}")
        
        print("\n" + "="*70)
        print("CODE:")
        print("="*70)
        print(ex['code'])

Example: gpac - filedump.c
CVE: CVE-2021-32138

Violations:
  Line 1015: 2f → 2F
  Line 1015: 2f → 2F
  Line 1017: 2f → 2F
  Line 1017: 2f → 2F

CODE:
void gf_inspect_format_timecode(const u8 *data, u32 size, u32 tmcd_flags, u32 tc_num, u32 tc_den, u32 tmcd_fpt, char szFmt[100]);

void DumpTrackInfo(GF_ISOFile *file, GF_ISOTrackID trackID, Bool full_dump, Bool is_track_num, Bool dump_m4sys)
{
	char szCodec[RFC6381_CODEC_NAME_SIZE_MAX];
	Double scale, max_rate, rate;
	Bool is_od_track = 0;
	u32 trackNum, i, j, ts, mtype, msub_type, timescale, sr, nb_ch, count, alt_group, nb_groups, nb_edits, cdur, csize, bps, pfmt, codecid;
	u64 time_slice, dur, size;
	s32 cts_shift;
	GF_ESD *esd;
	char szDur[50];
	char *lang;

	if (!is_track_num) {
		trackNum = gf_isom_get_track_by_id(file, trackID);
	} else {
		trackNum = trackID;
		trackID = gf_isom_get_track_id(file, trackNum);
	}
	if (!trackNum) {
		M4_LOG(GF_LOG_ERROR, ("No track with ID %d found\n", trackID));
		return;
	}

	timescale = gf_isom_g

In [139]:
# Cell 9: Filter Examples by Criteria
# Example: Filter by specific violation type
def filter_examples(examples, violation_type=None, min_violations=1, has_cve=False):
    """Filter examples by criteria"""
    filtered = []
    
    for ex in examples:
        # Check violation count
        if len(ex['violations']) < min_violations:
            continue
        
        # Check CVE
        if has_cve and not ex.get('cve'):
            continue
        
        # Check violation type
        if violation_type:
            types_in_example = [v['type'] for v in ex['violations']]
            if violation_type not in types_in_example:
                continue
        
        filtered.append(ex)
    
    return filtered

# Example usage:
hex_examples = filter_examples(examples, violation_type='hex_ul')
print(f"Examples with 'hex_ul' violations: {len(hex_examples)}")

cve_examples = filter_examples(examples, has_cve=True)
print(f"Examples with CVE: {len(cve_examples)}")

multiple_violations = filter_examples(examples, min_violations=3)
print(f"Examples with 3+ violations: {len(multiple_violations)}")

Examples with 'hex_ul' violations: 0
Examples with CVE: 25
Examples with 3+ violations: 15


In [140]:
len(examples)

25

In [125]:
# # Cell 10: Save Examples to JSON (Optional)
# import json

# output_file = 'misra_2_13_4_test_pair.json'

# with open(output_file, 'w') as f:
#     json.dump(examples, f, indent=2)

# print(f"✓ Saved {len(examples)} examples to: {output_file}")

In [126]:
examples

[{'idx': 195095,
  'project': 'e2guardian',
  'file_name': 'Socket.cpp',
  'cve': 'CVE-2021-44273',
  'commit_url': 'https://github.com/e2guardian/e2guardian/commit/eae46a7e2a57103aadca903c4a24cca94dc502a2',
  'violations': [{'literal': '130l',
    'fixed': '130L',
    'type': 'dec_l',
    'line': 23},
   {'literal': '1l', 'fixed': '1L', 'type': 'dec_l', 'line': 97},
   {'literal': '1l', 'fixed': '1L', 'type': 'dec_l', 'line': 98}],
  'code': 'int Socket::startSslClient(const std::string &certificate_path, String hostname)\n{\n    if (isssl) {\n        stopSsl();\n    }\n\n    ERR_clear_error();\n#if OPENSSL_VERSION_NUMBER < 0x10100000L\n    ctx = SSL_CTX_new(SSLv23_client_method());\n#else\n    ctx = SSL_CTX_new(TLS_client_method());\n#endif\n\n    if (ctx == NULL) {\n#ifdef NETDEBUG\n        std::cout << thread_id << "Error ssl context is null (check that openssl has been inited)" << std::endl;\n#endif\n        log_ssl_errors("Error ssl context is null for %s", hostname.c_str());\n  

In [141]:
valid_examples = examples.copy()

In [142]:
examples = test_examples.extend(valid_examples)

In [144]:
examples = train_examples + test_examples + valid_examples

In [145]:
len(examples)

311

In [148]:
# Cell 10: Save Examples to JSON (Optional)
import json

output_file = 'misra_2_13_4_all_pairs.json'

with open(output_file, 'w') as f:
    json.dump(examples, f, indent=2)

print(f"✓ Saved {len(examples)} examples to: {output_file}")

✓ Saved 311 examples to: misra_2_13_4_all_pairs.json


In [146]:
examples[0]

{'idx': 9,
 'project': 'ghostscript',
 'file_name': 'None',
 'cve': 'CVE-2018-1000037',
 'commit_url': 'http://git.ghostscript.com/?p=mupdf.git;a=commitdiff;h=b2e7d38e845c7d4922d05e6e41f3a2dc1bc1b14a;hp=f51836b9732c38d945b87fda0770009a77ba680c',
 'violations': [{'literal': '1.0f',
   'fixed': '1.0F',
   'type': 'float_f',
   'line': 92},
  {'literal': '1.0f', 'fixed': '1.0F', 'type': 'float_f_alt', 'line': 92},
  {'literal': '1.0f', 'fixed': '1.0F', 'type': 'float_f_alt2', 'line': 92},
  {'literal': '0f', 'fixed': '0F', 'type': 'int_f', 'line': 92}],
 'code': " pdf_show_image(fz_context *ctx, pdf_run_processor *pr, fz_image *image)\n {\n        pdf_gstate *gstate = pr->gstate + pr->gtop;\n        fz_matrix image_ctm;\n        fz_rect bbox;\n       softmask_save softmask = { NULL };\n \n        if (pr->super.hidden)\n                return;\n\t\t\tbreak;\n\t\tcase PDF_MAT_SHADE:\n\t\t\tif (gstate->fill.shade)\n\t\t\t{\n\t\t\t\tfz_clip_image_mask(ctx, pr->dev, image, &image_ctm, &bbox);\

In [147]:
examples

[{'idx': 9,
  'project': 'ghostscript',
  'file_name': 'None',
  'cve': 'CVE-2018-1000037',
  'commit_url': 'http://git.ghostscript.com/?p=mupdf.git;a=commitdiff;h=b2e7d38e845c7d4922d05e6e41f3a2dc1bc1b14a;hp=f51836b9732c38d945b87fda0770009a77ba680c',
  'violations': [{'literal': '1.0f',
    'fixed': '1.0F',
    'type': 'float_f',
    'line': 92},
   {'literal': '1.0f', 'fixed': '1.0F', 'type': 'float_f_alt', 'line': 92},
   {'literal': '1.0f', 'fixed': '1.0F', 'type': 'float_f_alt2', 'line': 92},
   {'literal': '0f', 'fixed': '0F', 'type': 'int_f', 'line': 92}],
  'code': " pdf_show_image(fz_context *ctx, pdf_run_processor *pr, fz_image *image)\n {\n        pdf_gstate *gstate = pr->gstate + pr->gtop;\n        fz_matrix image_ctm;\n        fz_rect bbox;\n       softmask_save softmask = { NULL };\n \n        if (pr->super.hidden)\n                return;\n\t\t\tbreak;\n\t\tcase PDF_MAT_SHADE:\n\t\t\tif (gstate->fill.shade)\n\t\t\t{\n\t\t\t\tfz_clip_image_mask(ctx, pr->dev, image, &image_