In [5]:
import pandas
import re

In [6]:

def load_section_overview_from_csv(input_filename_csv):
    df = pandas.read_csv(input_filename_csv, header=0, delimiter=',',
                     names=['section_id','file_id','url','heading_markdown','section_code'])
    
    readme_file_generator = lambda x: x.replace('https://github.com/','').replace('/','.') + '.md'
    df['local_readme_file'] = df['url'].apply(readme_file_generator) 
    # df['heading_text'] = df['heading_markdown'].apply(extract_text_in_heading_markdown)
    # In markdown, # = heading level 1, ## = heading level 2, etc.
    df['heading_level'] = df['heading_markdown'].apply(lambda x : len(re.search('^#+', x).group(0)))
    # df['abstracted_heading_markdown'] = df['heading_markdown'].apply(lambda x : abstract_text(x).replace('\n', ' ').strip())
    # df['abstracted_heading_text'] = df['abstracted_heading_markdown'].apply(extract_text_in_heading_markdown)
    # Don't convert to int, as data contains '-' for 'not in any class'
    # df['section_code'] = df['section_code'].apply(lambda x : merge_classes_1_and_2(x))
    
    return df

In [7]:
    csv_filename = '../input/dataset_combined.csv'
    readme_path = '../input/dev_and_eval_readmes/'

    df = load_section_overview_from_csv(csv_filename)

In [10]:
df.head()

Unnamed: 0,section_id,file_id,url,heading_markdown,section_code,local_readme_file,heading_level
0,1,1,https://github.com/xiaobai557/wechat,# [Easy WeChat](http://easywechat.org),-,xiaobai557.wechat.md,1
1,2,1,https://github.com/xiaobai557/wechat,## 特点,-,xiaobai557.wechat.md,2
2,3,1,https://github.com/xiaobai557/wechat,## 环境要求,-,xiaobai557.wechat.md,2
3,4,1,https://github.com/xiaobai557/wechat,## 安装,-,xiaobai557.wechat.md,2
4,5,1,https://github.com/xiaobai557/wechat,## 使用,-,xiaobai557.wechat.md,2


In [41]:
len(df)

4922

In [36]:
def extract_section_from_files(file_path):
    curr_filename = None
    curr_filename_lines = None
    curr_filename_line_number = 0

    sections = []
    
    # Section definition: lines between current heading and next, regardless of level
    # Can't simply use equivalent of left JOIN between list of headings and actual headings in file,
    # as a file may have a several heading with same text and level (e.g. multiple "Example" subheadings, one for each method in a reference section)
    for i, r in df.iterrows():
        heading_already_found = False
        
        local_readme_filename = r[5]
        heading_markdown = r[3]

        if (curr_filename is None) or (curr_filename != local_readme_filename):
            curr_filename = local_readme_filename
            with open (file_path + curr_filename, "r", encoding='utf-8') as myfile:
                # Read as is, use rstrip instead of strip to only remove trailing whitespace
                # We want to preserve leading whitespace to avoid treating line starting with space/tab followed by #
                # from being treated as heading
                curr_filename_lines = myfile.readlines()                    
                curr_filename_lines = [x.rstrip() for x in curr_filename_lines]
        curr_filename_line_number = 0 
        
        curr_section_content_lines = []
        # Iterate through file until heading markdown is found or end of file is found
        # Check also for underline-style formatting
        while (curr_filename_line_number < len(curr_filename_lines)):
            '''
            If a candidate heading is found, compare it with the heading we're looking for.
            If it's actually the one we want, set a flag, so if the next heading happens to have same string,
            we can tell that it's a different heading.
            '''
            if curr_filename_lines[curr_filename_line_number].startswith('#'):
                # Potential heading, starting with #. Is it the heading we want?
                candidate_heading = curr_filename_lines[curr_filename_line_number].replace('\n',' ').strip() 
                if ((candidate_heading != heading_markdown.strip()) or heading_already_found):
                    break
                else:
                    heading_already_found = True
            elif ((curr_filename_line_number < len(curr_filename_lines) - 1) and
                 curr_filename_lines[curr_filename_line_number + 1].startswith('===')):
                # Potential H1, in underline markdown style
                candidate_heading = curr_filename_lines[curr_filename_line_number].replace('\n',' ').strip() 
                if (('# ' + candidate_heading) != heading_markdown.strip() or heading_already_found):
                    # Skip next line (which is the underline)
                    curr_filename_line_number += 1                        
                    break
                else:
                        heading_already_found = True
            elif ((curr_filename_line_number < len(curr_filename_lines) - 1) and 
                 curr_filename_lines[curr_filename_line_number+1].startswith('---')):
                # Potential H2, in underline markdown style
                candidate_heading = curr_filename_lines[curr_filename_line_number].replace('\n',' ').strip() 
                if (('## ' + candidate_heading) != heading_markdown.strip() or heading_already_found):
                    # Skip next line (which is the underline)
                    curr_filename_line_number += 1                        
                    break
                else:
                        heading_already_found = True
            else:
                curr_section_content_lines.append(curr_filename_lines[curr_filename_line_number])
                
            # Proceed to next line
            curr_filename_line_number += 1
            
        curr_section_content = ' '.join(curr_section_content_lines)
        sections.append(curr_section_content)
    return sections

In [38]:
l = extract_section_from_files(readme_path)

In [39]:
len(l)

4922

In [47]:
l[:15]

[' 可能是目前最优雅的微信公众平台 SDK 了。  > 不支持企业号，也不打算支持，原因？微信的API实在设计得太乱了，我怕累死。。。  - SDK 官方论坛：https://forum.easywechat.org - SDK 使用交流 QQ 群：`319502940` - 微信开发者交流 QQ 群：`9179779`  [![Build Status](https://travis-ci.org/overtrue/wechat.svg?branch=master)](https://travis-ci.org/overtrue/wechat) [![Latest Stable Version](https://poser.pugx.org/overtrue/wechat/v/stable.svg)](https://packagist.org/packages/overtrue/wechat) [![Latest Unstable Version](https://poser.pugx.org/overtrue/wechat/v/unstable.svg)](https://packagist.org/packages/overtrue/wechat) [![Build Status](https://scrutinizer-ci.com/g/overtrue/wechat/badges/build.png?b=master)](https://scrutinizer-ci.com/g/overtrue/wechat/build-status/master) [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/overtrue/wechat/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/overtrue/wechat/?branch=master) [![Code Coverage](https://scrutinizer-ci.com/g/overtrue/wechat/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/overtrue/wec

In [44]:
df['section_contents'] = l

In [45]:
df

Unnamed: 0,section_id,file_id,url,heading_markdown,section_code,local_readme_file,heading_level,section_contents
0,1,1,https://github.com/xiaobai557/wechat,# [Easy WeChat](http://easywechat.org),-,xiaobai557.wechat.md,1,可能是目前最优雅的微信公众平台 SDK 了。 > 不支持企业号，也不打算支持，原因？微信...
1,2,1,https://github.com/xiaobai557/wechat,## 特点,-,xiaobai557.wechat.md,2,
2,3,1,https://github.com/xiaobai557/wechat,## 环境要求,-,xiaobai557.wechat.md,2,
3,4,1,https://github.com/xiaobai557/wechat,## 安装,-,xiaobai557.wechat.md,2,
4,5,1,https://github.com/xiaobai557/wechat,## 使用,-,xiaobai557.wechat.md,2,
...,...,...,...,...,...,...,...,...
4917,4918,435,https://github.com/bdurand/whenever,### Testing,3,bdurand.whenever.md,3,Whenever is a Ruby gem that provides a clear s...
4918,4919,435,https://github.com/bdurand/whenever,### Credit,5,bdurand.whenever.md,3,Whenever is a Ruby gem that provides a clear s...
4919,4920,435,https://github.com/bdurand/whenever,### Discussion / Feedback / Issues / Bugs,6,bdurand.whenever.md,3,Whenever is a Ruby gem that provides a clear s...
4920,4921,435,https://github.com/bdurand/whenever,##,-,bdurand.whenever.md,2,Whenever is a Ruby gem that provides a clear s...


In [46]:
df.to_csv (r'../README_df.csv', index = False, header=True)