In [12]:
import os
import pandas as pd
import re

In [21]:
def abstract_out_code_section(markdown_text):
    """Replace fenced (GitHub-flavored) code blocks with a placeholder token.

    Every block of the form::

        ```lang
        ...code...
        ```

    is substituted by ``' @abstr_code_section '``. The placeholder is padded
    with spaces so that a block adjacent to a word does not fuse into
    something like ``@abstr_code_sectionword``.

    Args:
        markdown_text: Markdown source as a single string.

    Returns:
        The text with each recognized fenced code block replaced by the
        placeholder; text without such blocks is returned unchanged.

    Notes:
        Only fences whose info string (the part after the opening backticks)
        consists of lowercase letters and ``. - , _`` are recognized, with at
        most one space/tab between the backticks and the info string.
    """
    # Pattern walkthrough:
    #   * Two negative lookbehinds guard the opening fence: it must not be
    #     preceded by another ``` (malformed ``````), nor by a lowercase
    #     letter (the tail of an inline ```xyz``` span followed by a newline
    #     must not be read as the start of a block).
    #   * The opening fence line may carry an optional info string and
    #     trailing whitespace, and must end with a newline.
    #   * The body is matched lazily up to the first closing fence that sits
    #     at the start of a line (optionally indented by whitespace).
    fenced_code_block = r'(?<!(```))(?<![a-z])```[ \t]{0,1}[a-z\.\-,\_]*[\s]*\n[\s\S]*?\n[\s]*(?<!(```))```'

    # `count` and `flags` are passed by keyword: positional use is deprecated
    # and makes it easy to confuse the two integer arguments.
    return re.sub(
        fenced_code_block,
        ' @abstr_code_section ',
        markdown_text,
        count=0,
        flags=re.DOTALL,
    )



def _collect_candidate_headings(content_lines):
    """Return ``(heading_level, heading_markdown)`` pairs found in the lines, in order."""
    headings = []
    last_index = len(content_lines) - 1

    # Start at the first non-empty line; a document with no such line simply
    # yields no headings instead of raising StopIteration.
    index = next((i for i, text in enumerate(content_lines) if text), len(content_lines))

    while index <= last_index:
        line = content_lines[index]
        following_line = content_lines[index + 1] if index < last_index else ''
        # Underline-style headings need actual text above the underline;
        # otherwise a blank line followed by a '---' rule would be recorded
        # as an empty heading.
        has_text = bool(line.strip())

        if line.startswith('#'):
            # ATX style: the level is the number of leading '#' characters.
            headings.append((len(re.match('#+', line).group(0)), line))
        elif has_text and following_line.startswith('---'):
            # H2 in underline markdown style, normalized to ATX form.
            headings.append((2, '## ' + line))
            index += 1  # Skip the underline itself
        elif has_text and following_line.startswith('==='):
            # H1 in underline markdown style, normalized to ATX form.
            headings.append((1, '# ' + line))
            index += 1  # Skip the underline itself

        index += 1

    return headings


def extract_headings_from_files_in_directory(target_readme_file_dir):
    """Collect candidate markdown headings from every README in a directory.

    Files are expected to be named ``<username>.<repo_name>.<ext>``; the
    GitHub URL of the repository is reconstructed from that name. Fenced code
    blocks are abstracted away first (via ``abstract_out_code_section``) so
    that code lines starting with ``#`` are less likely to be mistaken for
    headings, without altering genuine headings such as ``# Section 1``.

    Args:
        target_readme_file_dir: Path of the directory holding the README
            files. A trailing path separator is optional.

    Returns:
        A ``pandas.DataFrame`` with one row per candidate heading and the
        columns ``section_id`` (1-based, per file), ``file_id`` (1-based, per
        processed file), ``url``, ``heading_markdown`` (raw markdown of the
        heading line, underline-style headings rewritten with ``#``/``##``),
        ``local_readme_file`` and ``heading_level``.

    Notes:
        * Directory entries are processed in sorted order so that ``file_id``
          values are reproducible (``os.listdir`` order is arbitrary).
        * Entries that are not regular files, or whose name does not contain
          both a username and a repository part, are skipped and do not
          consume a ``file_id``.
    """
    columns = ['section_id', 'file_id', 'url', 'heading_markdown', 'local_readme_file', 'heading_level']
    # Rows are accumulated in a plain list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas >= 2.0 and per-row appends
    # copy the whole frame each time.
    records = []

    file_id = 1
    for filename in sorted(os.listdir(target_readme_file_dir)):
        readme_path = os.path.join(target_readme_file_dir, filename)

        # Used to construct the repo URL: '<username>.<repo_name>' is the
        # file name without its extension, split at the first dot.
        filename_w_o_ext = os.path.splitext(filename)[0]
        username, _, repo_name = filename_w_o_ext.partition('.')
        if not os.path.isfile(readme_path) or not username or not repo_name:
            continue
        url = 'https://github.com/{0}/{1}'.format(username, repo_name)

        with open(readme_path, 'r', encoding='utf-8', errors='backslashreplace') as f:
            content = f.read()

        # Abstract code sections before looking for headings (see docstring).
        content = abstract_out_code_section(content)
        candidate_headings = _collect_candidate_headings(content.splitlines())

        # Heading text is stored as raw markdown; no link conversion is done here.
        for section_id, (heading_level, heading_markdown) in enumerate(candidate_headings, start=1):
            records.append({
                'section_id': section_id,
                'file_id': file_id,
                'url': url,
                'heading_markdown': heading_markdown,
                'local_readme_file': filename,
                'heading_level': heading_level,
            })

        file_id += 1

    return pd.DataFrame(records, columns=columns)

In [22]:
# Build the headings table from every README file in the input directory.
df_headings = extract_headings_from_files_in_directory("../input/new_readmes_2/")

In [23]:
# Show the extracted headings table as the cell output.
df_headings

Unnamed: 0,section_id,file_id,url,heading_markdown,local_readme_file,heading_level
0,1,1,https://github.com/adamstac/jquery-qt-controller,# jQuery QT Controller 0.9.1,adamstac.jquery-qt-controller.md,1
1,2,1,https://github.com/adamstac/jquery-qt-controller,## Function,adamstac.jquery-qt-controller.md,2
2,3,1,https://github.com/adamstac/jquery-qt-controller,## Compatibility,adamstac.jquery-qt-controller.md,2
3,4,1,https://github.com/adamstac/jquery-qt-controller,## Licensing,adamstac.jquery-qt-controller.md,2
4,1,2,https://github.com/asweigart/PythonStdioGames,# PythonStdioGames,asweigart.PythonStdioGames.md,1
...,...,...,...,...,...,...
263,7,23,https://github.com/zachhardesty7/tamper-monkey...,## Usage,zachhardesty7.tamper-monkey-scripts-collection.md,2
264,8,23,https://github.com/zachhardesty7/tamper-monkey...,## Known Issues,zachhardesty7.tamper-monkey-scripts-collection.md,2
265,9,23,https://github.com/zachhardesty7/tamper-monkey...,## Contributing,zachhardesty7.tamper-monkey-scripts-collection.md,2
266,10,23,https://github.com/zachhardesty7/tamper-monkey...,## License,zachhardesty7.tamper-monkey-scripts-collection.md,2


In [24]:
# Persist the headings table as CSV: header row written, index column omitted.
df_headings.to_csv(r'../new_headings_df_2.csv', index=False, header=True)