In [None]:
!pip install transformers accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
import os
import re

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [None]:
def load_model(model_name="SalesForce/codegen-350M-mono"):
    """
    Load a causal language model and its tokenizer, and create a
    text-generation pipeline from them.

    Args:
        model_name: Hugging Face Hub id (or local path) of the checkpoint.
            Defaults to the CodeGen 350M "mono" checkpoint.

    Returns:
        The text-generation pipeline wrapping the loaded model and tokenizer.

    The ``HF_TOKEN`` environment variable is optional: when set it is passed
    to both ``from_pretrained`` calls, when unset the load proceeds
    anonymously. Adjust ``device_map`` if needed.
    """
    # A missing or empty variable must not abort the load of a public checkpoint.
    token = os.environ.get("HF_TOKEN") or None
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=token)
    # NOTE(review): torch_dtype is handed to pipeline() although the model is
    # already instantiated above - confirm it has the intended effect.
    generator = pipeline("text-generation", model=model, torch_dtype=torch.float16, tokenizer=tokenizer)
    return generator

In [None]:
def _label_matches(generated_text, expected_error, exact_match):
    """Return True when generated_text names expected_error under the chosen matching rule."""
    # Labels read from a CSV may be missing (e.g. a float NaN); such a sample cannot be a hit.
    if not isinstance(expected_error, str):
        return False
    wanted = expected_error.strip().lower()
    # An empty label would be a substring of every output, so it never counts as a hit.
    if not wanted:
        return False
    produced = generated_text.lower()
    if not exact_match:
        return wanted in produced
    # Strict mode: only the first identifier-like word of the output is the answer.
    first_word = re.search(r"[a-z0-9_]+", produced)
    return first_word is not None and first_word.group(0) == wanted


def benchmark_error_detection(generator, verilog_samples, expected_errors, prompt_template,
                              max_new_tokens=128, exact_match=False):
    """
    Score a text-generation callable on labelled Verilog samples.

    For each Verilog code sample:
    - Format a prompt using the prompt_template.
    - Use the model to generate output.
    - Check whether the expected error label appears in the generated text.

    Args:
        generator: Callable invoked as
            ``generator(prompt, max_new_tokens=..., do_sample=False, return_full_text=False)``
            whose result is indexed as ``result[0]['generated_text']``.
        verilog_samples: Iterable of code strings, one per sample.
        expected_errors: Iterable of expected labels, aligned with verilog_samples.
        prompt_template: Format string containing a ``{code}`` placeholder.
        max_new_tokens: Generation budget forwarded to the generator.
        exact_match: When False (default), a sample counts as detected if the
            label is a case-insensitive substring of the output. When True,
            only the first identifier-like word of the output is compared to
            the label, so an output that merely lists many labels is not a hit.

    Returns:
        The proportion of samples for which the expected error was found
        (0 when there are no samples).

    Raises:
        ValueError: If verilog_samples and expected_errors differ in length.
    """
    verilog_samples = list(verilog_samples)
    expected_errors = list(expected_errors)
    # zip() would silently drop the unmatched tail and skew the denominator.
    if len(verilog_samples) != len(expected_errors):
        raise ValueError(
            f"verilog_samples has {len(verilog_samples)} items but "
            f"expected_errors has {len(expected_errors)}"
        )

    correct = 0
    total = len(verilog_samples)

    for idx, (code, expected_error) in enumerate(zip(verilog_samples, expected_errors)):
        # Create prompt by injecting the Verilog code into the template
        prompt = prompt_template.format(code=code)
        print(f"--- Sample {idx+1} ---")
        print("Prompt:")
        print(prompt)

        # Generate model output (greedy decoding, continuation only)
        result = generator(prompt, max_new_tokens=max_new_tokens, do_sample=False, return_full_text=False)
        generated_text = result[0]['generated_text']
        print("Model Output:")
        print(generated_text)

        # Check if the expected error label appears in the output
        if _label_matches(generated_text, expected_error, exact_match):
            print("-> Expected error detected.\n")
            correct += 1
        else:
            print("-> Expected error NOT detected.\n")

    accuracy = correct / total if total > 0 else 0
    return accuracy

In [None]:
if __name__ == "__main__":
    # Path of the benchmark CSV. It must provide the columns "code" (the
    # Verilog source) and "error_type" (the expected label).
    csv_file = "/content/output.csv"  # Change this to your CSV file's path

    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    missing_columns = {"code", "error_type"} - set(df.columns)
    if missing_columns:
        # Same exception type as the column lookup below would raise, with a clearer message.
        raise KeyError(f"{csv_file} is missing required column(s): {sorted(missing_columns)}")
    verilog_samples = df["code"].tolist()
    expected_errors = df["error_type"].tolist()

    # Define the prompt template. {code} will be replaced by the actual Verilog code.
    # The separators around {code} are real newlines ("\n"); the earlier "/n"
    # was emitted verbatim into every prompt.
    prompt_template = (
        "Analyze the following Verilog code \n\n{code}\n and provide exactly one error type "
        "from the list below. Your response must be one word only, nothing else: "
        "missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, "
        "missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, "
        "using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, "
        "malformed_timescale_directive, unclosed_block_comment, "
        "missing_or_extra_quote_in_string_literals, "
        "missing_or_extra_backtick_in_preprocessor_directives, "
        "incomplete_or_extra_array_brackets, malformed_generate_statement, "
        "missing_or_extra_comma_in_port_or_sensitivity_list, unclosed_string_in_system_task, "
        "malformed_macro_definition, premature_termination, undefined_variable, "
        "operator_misuse, redundant_variable_declaration, incorrect_encoding, "
        "incorrect_data_type_assignment, port_mode_declaration_error, "
        "data_index_out_of_bounds, improper_use_of_keywords, no_error, other_error"
    )

    # Load the model
    generator = load_model()

    # Benchmark error detection
    accuracy = benchmark_error_detection(generator, verilog_samples, expected_errors, prompt_template)
    print(f"Overall Accuracy of Error Detection: {accuracy*100:.2f}%")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/240 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/797M [00:00<?, ?B/s]

Some weights of the model checkpoint at SalesForce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e

--- Sample 1 ---
Prompt:
Analyze the following Verilog code /n/n
/* Unclosed block comment injection
module RefModule (
  input a,
  input b,
  output q
);

  assign q = a&b;

endmodule

/n and provide exactly one error type from the list below. Your response must be one word only, nothing else: missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, malformed_timescale_directive, unclosed_block_comment, missing_or_extra_quote_in_string_literals, missing_or_extra_backtick_in_preprocessor_directives, incomplete_or_extra_array_brackets, malformed_generate_statement, missing_or_extra_comma_in_port_or_sensitivity_list, unclosed_string_in_system_task, malformed_macro_definition, premature_termination, undefined_variable, operator_misuse, redundant_variable_declaration, incorrect_encoding, incorrect_data_type_assignment, port_mode_d

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Output:
, missing_or_extra_endmodule, missing_or_extra_endfunction, missing_or_extra_endgenerate, missing_or_extra_endgenerate_endfunction, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing
-> Expected error NOT detected.

--- Sample 2 ---
Prompt:
Analyze the following Verilog code /n/n
modul
endcase // Extra inserted
e RefModule (
  input in,
  output out
);

  assign out = in;

endmodule

/n and provide exactly one error type from the list below. Your response must be one word only, nothing else: missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, malformed_timescale_directive, unclosed_bloc

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Output:
, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, missing_or_extra_begin_end, missing_or_extra_begin_end_endfunction_endgenerate, missing_or_extra_begin_end_end_endfunction_endgenerate, missing_or_extra_begin_end_end_endfunction_endgenerate, missing_or_extra_begin_end_end_endfunction_endgenerate, missing_or_extra_begin_end_end_endfunction_endgenerate
-> Expected error detected.

--- Sample 3 ---
Prompt:
Analyze the following Verilog code /n/n
module
endcase // Extra inserted
 RefModule (
  input a,
  input b,
  output out
);

  assign out = ~(a | b);

endmodule

/n and provide exactly one error type from the list below. Your response must be one word only, nothing else: missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, malformed_timescale_directive, unclosed_block_comment, miss

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Output:
, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, missing_or_extra_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, missing_or_extra_quote_in_string_literals, missing_or_extra_backtick_in_preprocessor_directives, missing_or_extra_comma_in_port_or_sensitivity_list, missing_or_extra_comma_in_s
-> Expected error detected.

--- Sample 4 ---
Prompt:
Analyze the following Verilog code /n/n
module assign (
  input a,
  input b,
  output out
);

  assign out = ~(a | b);

endmodule

/n and provide exactly one error type from the list below. Your response must be one word only, nothing else: missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, malformed_timescale_directive, unclosed_block_comment, missing_or_extra_quote_in_string_literals, missing_or_ext

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Output:
, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, missing_or_extra_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, missing_or_extra_quote_in_string_literals, missing_or_extra_backtick_in_preprocessor_directives, missing_or_extra_comma_in_port_or_sensitivity_list, missing_or_extra_comma_in_s
-> Expected error NOT detected.

--- Sample 5 ---
Prompt:
Analyze the following Verilog code /n/n
module if (
  input d,
  input ena,
  output logic q
);

  always@(*) begin
    if (ena)
      q = d;
  end

endmodule

/n and provide exactly one error type from the list below. Your response must be one word only, nothing else: missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, malformed_timescale_directive, unclosed_block_comment, missing_or_extra_quote_in

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Output:
, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, missing_or_extra_begin_end, missing_or_extra_begin_end_endfunction_endgenerate, missing_or_extra_begin_end_end_endfunction_endgenerate, missing_or_extra_begin_end_end_endfunction_endgenerate, missing_or_extra_begin_end_end_endfunction_endgenerate, missing_or_extra_begin_end_end_endfunction_endgenerate
-> Expected error NOT detected.

--- Sample 6 ---
Prompt:
Analyze the following Verilog code /n/n
module Re
generate // Missing endgenerate
fModule (
  input a,
  input b,
  output out
);

  assign out = ~(a^b);

endmodule

/n and provide exactly one error type from the list below. Your response must be one word only, nothing else: missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, malformed_timescale_directive, unclosed_block_comme

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Output:
, missing_or_extra_endmodule, missing_or_extra_endfunction, missing_or_extra_endgenerate, missing_or_extra_endcase_endfunction_endgenerate, missing_or_extra_begin_end, missing_or_extra_begin_endmodule, missing_or_extra_begin_endcase_endfunction_endgenerate, missing_or_extra_begin_endmodule, missing_or_extra_begin_endcase_endfunction_endgenerate, missing_or_extra_begin_endmodule, missing_or_
-> Expected error NOT detected.

--- Sample 7 ---
Prompt:
Analyze the following Verilog code /n/n
module Ref)Module (
  input a,
  input b,
  output out
);

  assign out = a & b;

endmodule

/n and provide exactly one error type from the list below. Your response must be one word only, nothing else: missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, malformed_timescale_directive, unclosed_block_comment, missing_or_extra_q

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Output:
, missing_or_extra_endmodule, missing_or_extra_endfunction, missing_or_extra_endgenerate, missing_or_extra_endgenerate_endfunction, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing
-> Expected error NOT detected.

--- Sample 8 ---
Prompt:
Analyze the following Verilog code /n/n
module Ref)Module (
  input a,
  input b,
  output q
);

  assign q = a&b;

endmodule

/n and provide exactly one error type from the list below. Your response must be one word only, nothing else: missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, malformed_timescale_directive, unclosed_block_comment, missing_

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Output:
, missing_or_extra_endmodule, missing_or_extra_endfunction, missing_or_extra_endgenerate, missing_or_extra_endgenerate_endfunction, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing_or_extra_endgenerate_endfunction_endgenerate, missing
-> Expected error NOT detected.

--- Sample 9 ---
Prompt:
Analyze the following Verilog code /n/n
module RefMo
endfunction // Extra inserted
dule (
  input in,
  output out
);

  assign out = in;

endmodule

/n and provide exactly one error type from the list below. Your response must be one word only, nothing else: missing_semicolon, mismatched_parentheses, missing_or_extra_endmodule, missing_or_extra_endcase_endfunction_endgenerate, mismatched_begin_end, using_reserved_keyword_as_identifier, missing_or_extra_colon_in_case_item, malformed_timescale_directive, unclosed_