In [None]:
# TODO
#
# Consider replacing the 're.search' calls with 're.findall' for efficiency: both where a match is already known to occur, and where it is not known (the possibly empty result list can then be handled directly)
#
# Need to be able to handle module declaration format of
#   ./FilesToParse/minispartan6-audio/src_v/core_soc/spi_lite.v
#
# Check other TODO notes

In [None]:
# Hide the output of this cell
%%capture

import os

# Remove the folder if it's already there, then make the folder and go into it
! rm -rf FilesToParse
! mkdir FilesToParse
os.chdir('FilesToParse')

# These are a bunch of example repos I found that have some Verilog files in them
! git clone https://github.com/sudhamshu091/32-Verilog-Mini-Projects.git
! git clone https://github.com/snbk001/Verilog-Design-Examples.git
! git clone https://github.com/ashishrana160796/verilog-starter-tutorials.git
! git clone https://github.com/mongrelgem/Verilog-Adders.git
! git clone https://github.com/mihir8181/VerilogHDL-Codes.git
! git clone https://github.com/sudhamshu091/Single-Cycle-Risc-Processor-32-bit-Verilog.git
! git clone https://github.com/ultraembedded/minispartan6-audio.git

# Move back to the starting directory to continue with program
os.chdir('..')

# Remove the sample directory of files that Colab spawns every time we open it again
! rm -rf sample_data

In [176]:
import re

# Declaration keywords that are collected for every module.
# The position of a keyword in this list is also its index into the per-module
# `keyword_list` (all lookups go through `keywords.index(...)`), and the results
# are printed in this order.
keywords = ['input', 'output', 'reg', 'wire']

def clean_lists(list_):
    """Turn collected declaration fragments into a list of clean element strings.

    The fragments are concatenated, split on the separators ``,``, ``)`` and ``;``,
    and every piece is stripped of surrounding whitespace.

    Args:
        list_: iterable of strings (fragments or single characters).

    Returns:
        list of non-empty, stripped strings in their original order.
    """
    whole = ''.join(list_)
    # Filter AFTER stripping so whitespace-only pieces are dropped as well, and
    # build a new list instead of removing items from a list while iterating it
    # (which skips the entry that follows every removed one).
    stripped = (piece.strip() for piece in re.split(r',|\)|;', whole))
    return [piece for piece in stripped if piece]


def print_keyword(keyword, keyword_list):
    """Print the cleaned entries collected for one keyword.

    Writes a header line with the keyword and the number of entries, followed by
    one indented line per entry. Output goes to the module-level ``output_file``.

    Args:
        keyword: one of the names in ``keywords``.
        keyword_list: per-keyword lists of collected fragments, indexed like ``keywords``.
    """
    entries = clean_lists(keyword_list[keywords.index(keyword)])
    print('    {} ({}):'.format(keyword, len(entries)), file=output_file)
    for entry in entries:
        print('      ', entry, sep='', file=output_file)


def check_if_in_module(in_module, keyword_list):
    """Flush a module that is still open, then report that a module is open.

    When ``in_module`` is truthy (the previous module had no 'endmodule' yet), the
    entries of every keyword are printed via ``print_keyword``.

    Returns:
        Always True, which callers store as the new ``in_module`` state.
    """
    if not in_module:
        return True
    for name in keywords:
        print_keyword(name, keyword_list)
    return True


# Helper for declarations in which a bit width is written once but applies to several
#   names (e.g. "[3:0] a, b"): the width is copied in front of every name after the first
def bit_width_distributor(elements, bit_width_specifier):
    """Prefix every element after the first with ``bit_width_specifier``.

    Args:
        elements: comma-separated element text; commas at either end are ignored.
        bit_width_specifier: text (e.g. "[3:0]") to put in front of elements 2..n.

    Returns:
        A string in which each element is preceded by a space and followed by a
        comma, the separator the list cleaning later splits on.
    """
    first, *others = elements.strip(',').split(',')
    widened = [first] + [bit_width_specifier + name for name in others]
    return ''.join(' ' + name + ',' for name in widened)


def search_module_dec(line, keyword_list):
    """Collect the 'input' and 'output' ports named on one module-declaration line.

    For each direction, every run of text between that keyword and the next
    'input'/'output' keyword (or the end of the line) is captured. A run that
    contains a bit width of the form [number:number] gets that width distributed
    to each of its names. The runs of one direction are concatenated and appended
    as a single string to that direction's list in ``keyword_list``.

    Args:
        line: text of the declaration line.
        keyword_list: per-keyword lists, indexed like ``keywords``; modified in place.

    Returns:
        The same ``keyword_list`` object.
    """
    for direction in ('input', 'output'):
        pattern = r'\b' + direction + r'\b\s*(.*?)\s*(?=\binput\b|\boutput\b|$)'
        pieces = []
        for chunk in re.findall(pattern, line):
            width = re.search(r'\[\d+:\d+\]', chunk)     # bit width indicator ([number:number])
            pieces.append(bit_width_distributor(chunk, width.group()) if width else chunk)
        # An empty string is appended when the line names no port of this direction
        keyword_list[keywords.index(direction)].append(''.join(pieces))

    return keyword_list


def _strip_comments(line, in_comment_block):
    """Remove Verilog comments from one physical line.

    Args:
        line: raw text of the line.
        in_comment_block: True when a /* ... */ comment opened on an earlier line
            is still open at the start of this line.

    Returns:
        (code, in_comment_block): the text that lies outside any comment, and
        whether a block comment is still open at the end of the line.
    """
    kept = []
    pos = 0
    while pos < len(line):
        if in_comment_block:
            close_at = line.find('*/', pos)
            if close_at == -1:
                break                                # the comment continues on the next line
            pos = close_at + 2
            in_comment_block = False
            kept.append(' ')                         # keep the text on both sides of the comment apart
            continue
        block_at = line.find('/*', pos)
        line_at = line.find('//', pos)
        if line_at != -1 and (block_at == -1 or line_at < block_at):
            kept.append(line[pos:line_at])           # the rest of the line is a // comment
            break
        if block_at == -1:
            kept.append(line[pos:])                  # no further comment on this line
            break
        kept.append(line[pos:block_at])
        pos = block_at + 2
        in_comment_block = True
    return ''.join(kept), in_comment_block


def parse_lines(lines, output_file):
    """Scan the lines of one Verilog file and report each module's declarations.

    For every module header found, "  Module: <name>" is printed to ``output_file``.
    The entries collected for each name in ``keywords`` are printed through
    ``print_keyword`` when the module's 'endmodule' is reached, when the next
    module header appears first, or at the end of the input.

    Args:
        lines: iterable of text lines.
        output_file: file-like object that receives the "Module:" lines.

    Returns:
        None.
    """
    in_comment_block = False
    in_module = False
    in_module_dec = False       # module declaration example: module RCA4(output [3:0] sum, output cout, input [3:0] a, b, input cin);

    keyword_list = [[] for _ in keywords]

    for raw_line in lines:

        # Comments are removed from the raw text first. Doing the '=' truncation
        # before this could cut away a closing */ and leave the comment block open
        # for the rest of the file.
        line, in_comment_block = _strip_comments(raw_line, in_comment_block)

        if '=' in line:
            # Drop the value part of any assignment, such as
            #   wire read_en_w  = cfg_arvalid_i & cfg_arready_o;
            #   - currently we do not keep track of assignments, so this is fine.
            # The ';' makes sure the element is treated as its own thing and not part of the next.
            line = line.split('=')[0] + ';'

        if not line.strip():
            continue                            # nothing but whitespace/comments on this line

        # The first line this checks is the second line of the module declaration
        if in_module_dec:
            keyword_list = search_module_dec(line, keyword_list)
            # \s* allows for the blank that is left behind by ");  // comment"
            if re.search(r'\);\s*$', line):
                in_module_dec = False           # we are no longer in the module declaration
            continue

        # A module header is the word 'module' followed by a name. Requiring the
        # name in this one search means identifiers that merely contain 'module'
        # (or a header without a name on the same line) are never dereferenced as
        # a failed match.
        header = re.search(r'\bmodule\s+(\w+)', line)

        if header:
            # Flush a previous module that had no 'endmodule', then start from
            # empty lists so its entries are not reported again for this module.
            in_module = check_if_in_module(in_module, keyword_list)
            keyword_list = [[] for _ in keywords]
            print("  Module:", header.group(1), file=output_file)

            # Declaration format 1: at least one named listing of inputs or outputs
            # in the module declaration itself.
            if re.search(r'\b(?:input|output)\b', line[header.end():]):
                keyword_list = search_module_dec(line, keyword_list)
                if not re.search(r'\);\s*$', line):   # no closing ); - the declaration spans multiple lines
                    in_module_dec = True
                continue
            # Declaration format 2 or 3 (names without input/output, or only an
            # open parenthesis): fall through to the keyword scan below.

        elif re.search('endmodule', line):      # end of a module
            in_module = False
            for keyword in keywords:
                print_keyword(keyword, keyword_list)
            keyword_list = [[] for _ in keywords]

        # Stand-alone declarations: a line that starts with one of the keywords
        for keyword in keywords:
            declaration = re.search(r'\A\s*' + keyword + r'\s+(.*)', line)
            if not declaration:
                continue
            elements = declaration.group(1)
            bit_width = re.search(r'\[\d+:\d+\]', elements)     # bit width indicator ([number:number])
            if bit_width:
                elements = bit_width_distributor(elements, bit_width.group())
            keyword_list[keywords.index(keyword)].append(elements)

    # Print the collected entries if the end of the file is reached while a module is still open (it had no 'endmodule')
    check_if_in_module(in_module, keyword_list)

In [177]:
# Name of the text file that receives the parsed results of this run
parsed_files_filename = 'parsed_file_output2.txt'
# NOTE: print_keyword() writes to the module-level name `output_file` directly,
# so the handle must be bound to exactly this name before any parsing starts.
output_file = open(parsed_files_filename, 'w')

def traverse_directory(directory, extension, output_file):
    """Recursively parse every file under ``directory`` whose name ends with ``extension``.

    For each matching file a "File: <path>" line is printed to ``output_file``, the
    file's lines are handed to ``parse_lines``, and a blank line is printed.

    Args:
        directory: path of the directory to scan.
        extension: file-name suffix to match, e.g. ".v".
        output_file: file-like object that receives the report.

    Returns:
        Number of files parsed, including those in sub-directories.
    """
    files_parsed = 0
    # Sorted so the order of the report does not depend on the file system
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        if os.path.isfile(filepath) and filename.endswith(extension):
            # The with-block closes the source file even if parsing raises;
            # errors='replace' keeps one undecodable byte from aborting the scan.
            with open(filepath, 'r', errors='replace') as source:
                lines = source.readlines()
            print("File:", filepath, file=output_file)
            parse_lines(lines, output_file)
            print(file=output_file)                                     # print a new line after the information for the current file
            files_parsed += 1
        elif os.path.isdir(filepath):
            files_parsed += traverse_directory(filepath, extension, output_file)

    return files_parsed

# Parse every Verilog (.v) file below the current directory. The finally clause
# closes (and thereby flushes) the output file even if parsing stops with an exception.
try:
    files_parsed = traverse_directory(".", ".v", output_file)
finally:
    output_file.close()
print("Files parsed:", files_parsed)
print("Parsed output has been saved to {}".format(parsed_files_filename))

Files parsed: 287
Parsed output has been saved to parsed_file_output2.txt


In [None]:
# Cell to parse only the files of a single directory (useful for checking the parser on a small sample)

# Name of the text file that receives the parsed results of this cell
parsed_files_filename = 'single_file_parsed_output.txt'
# NOTE: print_keyword() writes to the module-level name `output_file` directly,
# so the handle must be bound to exactly this name before any parsing starts.
output_file = open(parsed_files_filename, 'w')

def traverse_directory(directory, extension, output_file):
    """Recursively parse every file under ``directory`` whose name ends with ``extension``.

    For each matching file a "File: <path>" line is printed to ``output_file``, the
    file's lines are handed to ``parse_lines``, and a blank line is printed.

    Args:
        directory: path of the directory to scan.
        extension: file-name suffix to match, e.g. ".v".
        output_file: file-like object that receives the report.

    Returns:
        Number of files parsed, including those in sub-directories.
    """
    files_parsed = 0
    # Sorted so the order of the report does not depend on the file system
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        if os.path.isfile(filepath) and filename.endswith(extension):
            # The with-block closes the source file even if parsing raises;
            # errors='replace' keeps one undecodable byte from aborting the scan.
            with open(filepath, 'r', errors='replace') as source:
                lines = source.readlines()
            print("File:", filepath, file=output_file)
            parse_lines(lines, output_file)
            print(file=output_file)                                     # print a new line after the information for the current file
            files_parsed += 1
        elif os.path.isdir(filepath):
            files_parsed += traverse_directory(filepath, extension, output_file)

    return files_parsed

# Directory to parse, built from the folder the setup cell cloned the repositories
# into, so the cell does not depend on Colab's /content location. abspath keeps the
# "File:" lines in the report absolute.
single_directory = os.path.abspath(os.path.join("FilesToParse", "Verilog-Adders", "Carry Skip Adder"))

# The finally clause closes (and thereby flushes) the output file even if parsing
# stops with an exception.
try:
    files_parsed = traverse_directory(single_directory, ".v", output_file)
finally:
    output_file.close()
print("Files parsed:", files_parsed)
print("Parsed output has been saved to {}".format(parsed_files_filename))