In [1]:
from helper2 import *

In [2]:
import pandas
import re

In [3]:
def generate_abstract_text(text):
    """Apply ``abstract_text`` to every element of ``text``.

    Parameters
    ----------
    text : iterable
        Items that are passed, one at a time and in order, to ``abstract_text``.

    Returns
    -------
    list
        One ``abstract_text`` result per input item, in input order.
    """
    # `abstract_text` is not defined in this notebook; presumably it arrives
    # through the wildcard import from helper2 -- TODO confirm.
    return [abstract_text(item) for item in text]

In [4]:
def load_section_overview_from_csv(input_filename_csv):
    """Load the section-overview CSV and add derived helper columns.

    The first row of the file is consumed as a header and its names are
    replaced by: section_id, file_id, url, heading_markdown, section_code.

    Two columns are added:

    * ``local_readme_file`` -- ``url`` with the ``https://github.com/`` prefix
      removed, every ``/`` turned into ``.`` and ``.md`` appended.
    * ``heading_level`` -- number of leading ``#`` characters in
      ``heading_markdown`` (``#`` = level 1, ``##`` = level 2, ...). Values
      that do not start with ``#`` or are not strings (e.g. NaN from an empty
      cell) get level 0 instead of raising.

    Parameters
    ----------
    input_filename_csv : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded rows plus the two derived columns.
    """
    column_names = ['section_id', 'file_id', 'url', 'heading_markdown', 'section_code']
    df = pandas.read_csv(input_filename_csv, header=0, delimiter=',',
                         names=column_names)

    def readme_file_generator(url):
        # Flatten "<owner>/<repo>" into a single "<owner>.<repo>.md" file name.
        return url.replace('https://github.com/', '').replace('/', '.') + '.md'

    leading_hashes = re.compile(r'^#+')

    def heading_level(heading):
        # Guard against non-string cells and headings without a leading '#';
        # calling .group() on a failed match would raise AttributeError.
        if not isinstance(heading, str):
            return 0
        match = leading_hashes.search(heading)
        return len(match.group(0)) if match else 0

    df['local_readme_file'] = df['url'].apply(readme_file_generator)
    df['heading_level'] = df['heading_markdown'].apply(heading_level)
    # section_code is intentionally left as loaded (not converted to int):
    # the data contains '-' for 'not in any class'.

    return df

In [5]:
# Input locations, relative to the notebook's working directory.
csv_filename = '../input/dataset_combined.csv'
# Directory prefix for README files; it is concatenated directly with a file
# name by extract_section_from_files, so the trailing '/' is required.
readme_path = '../input/dev_and_eval_readmes/'
# Section overview frame shared (and extended in place) by the cells below.
df = load_section_overview_from_csv(csv_filename)

In [6]:
# Display every raw heading string from the overview as a plain Python list.
df['heading_markdown'].tolist()

['# [Easy WeChat](http://easywechat.org)',
 '## 特点',
 '## 环境要求',
 '## 安装',
 '## 使用',
 '## 文档',
 '## 框架集成',
 '## 贡献代码',
 '## License',
 '# ThinkPHP 5.0',
 '## 目录结构',
 '## 命名规范',
 '### 目录和文件',
 '### 函数和类、属性命名',
 '### 常量和配置',
 '### 数据表和字段',
 '## 参与开发',
 '## 版权信息',
 '# SYMON - A 6502 System Simulator',
 '## 1.0 About',
 '## 2.0 Requirements',
 '## 3.0 Features',
 '### 3.1 Memory Maps',
 '#### 3.1.1 Symon Memory Map',
 '#### 3.1.2 MULTICOMP Memory Map',
 '### 3.1.3 Simple Memory Map',
 '### 3.2 Serial Console and CPU Status',
 '### 3.3 ROM Loading',
 '### 3.4 Memory Window',
 '### 3.5 Trace Log',
 '### 3.6 Simulator Speeds',
 '### 3.7 Breakpoints',
 '### 3.8 Experimental 6545 CRTC Video',
 '#### 3.8.1 Example BASIC Program to test Video',
 '## 4.0 Usage',
 '### 4.1 Building',
 '### 4.2 ROM images',
 '### 4.3 Loading A Program',
 '### 4.4 Running',
 '## 5.0 Revision History',
 '## 6.0 Roadmap',
 '## 7.0 To Do',
 '## 8.0 Copyright and Acknowledgements',
 '## 9.0 Licensing',
 '# Code Along: An

In [7]:
def extract_section_from_files(file_path, section_df=None):
    """Collect the body text of every section listed in the overview frame.

    Rows are processed in order. For each row the README named by its
    ``local_readme_file`` value is scanned from the position where the
    previous row of the same file stopped, until the next heading candidate
    that is not the row's own heading (or the end of the file). A section is
    therefore "the lines between the current heading and the next one,
    regardless of level". Headings cannot simply be joined against the
    overview by text, because one file may contain several headings with the
    same text and level (e.g. repeated "Example" sub-headings).

    Parameters
    ----------
    file_path : str
        Directory prefix of the README files. It is concatenated directly
        with the file name, so it must end with a path separator.
    section_df : pandas.DataFrame, optional
        Overview frame providing the ``local_readme_file`` and
        ``heading_markdown`` columns. Defaults to the notebook-level ``df``
        so existing single-argument calls keep working.

    Returns
    -------
    list of str
        One entry per row of the frame: the section's lines (trailing
        whitespace removed) joined with single spaces.

    Raises
    ------
    OSError
        If a README file cannot be opened.
    """
    if section_df is None:
        # Backward-compatible default: the frame built earlier in the notebook.
        section_df = df

    curr_filename = None
    curr_filename_lines = None
    curr_filename_line_number = 0
    sections = []
    for _, r in section_df.iterrows():
        heading_already_found = False
        # Look columns up by label: integer keys on a label-indexed Series are
        # deprecated in pandas and would silently depend on column order.
        local_readme_filename = r['local_readme_file']
        heading_markdown = r['heading_markdown']
        if (curr_filename is None) or (curr_filename != local_readme_filename):
            curr_filename = local_readme_filename
            with open(file_path + curr_filename, "r", encoding='utf-8') as myfile:
                # rstrip (not strip): leading whitespace is preserved so that a
                # line starting with space/tab followed by '#' is not treated
                # as a heading.
                curr_filename_lines = [x.rstrip() for x in myfile.readlines()]
            # New file: restart scanning from its first line.
            curr_filename_line_number = 0
        curr_section_content_lines = []
        # Walk the file until the next foreign heading candidate or end of file.
        # Both '#'-style and underline-style ('===' / '---') headings are checked.
        # Once the wanted heading has been seen, heading_already_found is set so
        # that a following heading with the very same text is recognised as a
        # different heading and ends the section.
        while curr_filename_line_number < len(curr_filename_lines):
            line = curr_filename_lines[curr_filename_line_number]
            has_next_line = curr_filename_line_number < len(curr_filename_lines) - 1
            if line.startswith('#'):
                # Potential heading, starting with '#'. Is it the one we want?
                candidate_heading = line.replace('\n', ' ').strip()
                if (candidate_heading != heading_markdown.strip()) or heading_already_found:
                    # Stop ON this line so the next row can match it.
                    break
                heading_already_found = True
            elif has_next_line and curr_filename_lines[curr_filename_line_number + 1].startswith('==='):
                # Potential H1 in underline markdown style.
                candidate_heading = line.replace('\n', ' ').strip()
                if ('# ' + candidate_heading) != heading_markdown.strip() or heading_already_found:
                    # Advance one line before stopping: the next row resumes on
                    # the underline itself, not on the candidate's text line.
                    # NOTE(review): this means the next row cannot match this
                    # underlined heading by text -- confirm that is intended.
                    curr_filename_line_number += 1
                    break
                heading_already_found = True
            elif has_next_line and curr_filename_lines[curr_filename_line_number + 1].startswith('---'):
                # Potential H2 in underline markdown style.
                candidate_heading = line.replace('\n', ' ').strip()
                if ('## ' + candidate_heading) != heading_markdown.strip() or heading_already_found:
                    # Same one-line advance as in the '===' branch above.
                    curr_filename_line_number += 1
                    break
                heading_already_found = True
            else:
                curr_section_content_lines.append(line)
            # Proceed to next line
            curr_filename_line_number += 1
        sections.append(' '.join(curr_section_content_lines))
    return sections

In [8]:
# Attach the extracted body text of each section (one string per overview row)
# as a new column on the shared frame.
df['section_contents'] = extract_section_from_files(readme_path)

In [9]:
# Inspect the extracted section bodies.
df['section_contents']

0        可能是目前最优雅的微信公众平台 SDK 了。  > 不支持企业号，也不打算支持，原因？微信...
1         - 命名不那么乱七八糟；  - 隐藏开发者不需要关注的细节；  - 方法使用更优雅，不必...
2        1. PHP >= 5.5.9 2. **[composer](https://getco...
3        ```shell composer require "overtrue/wechat:~3...
4        基本使用（以服务端为例）:  ```php <?php  use EasyWeChat\F...
                              ...                        
4917     [whenever-test](https://github.com/heartbits/...
4918     Whenever was created for use at Inkling (<htt...
4919     For general discussion and questions, please ...
4920    ----  [![Build Status](https://secure.travis-c...
4921           ----  Copyright &copy; 2016 Javan Makhmali
Name: section_contents, Length: 4922, dtype: object

In [10]:
# Run every raw heading through generate_abstract_text and store the results
# next to the originals.
df['abstract_heading_markdown'] = generate_abstract_text(df['heading_markdown'].tolist())

In [11]:
# Inspect the abstracted heading markdown.
df['abstract_heading_markdown']

0                            #  @abstr_hyperlink \n
1                                           ## 特点\n
2                                         ## 环境要求\n
3                                           ## 安装\n
4                                           ## 使用\n
                           ...                     
4917                                  ### Testing\n
4918                                   ### Credit\n
4919    ### Discussion / Feedback / Issues / Bugs\n
4920                                          # #\n
4921                                          # #\n
Name: abstract_heading_markdown, Length: 4922, dtype: object

In [12]:
def generate_extract_text_from_markdown_snippet(text):
    """Apply ``extract_text_from_markdown_snippet`` to every element of ``text``.

    Parameters
    ----------
    text : iterable
        Markdown snippets, processed one at a time and in order.

    Returns
    -------
    list
        One ``extract_text_from_markdown_snippet`` result per snippet, in
        input order.
    """
    # The per-snippet helper is not defined in this notebook; presumably it
    # arrives through the wildcard import from helper2 -- TODO confirm.
    return [extract_text_from_markdown_snippet(item) for item in text]

In [13]:
def generate_extract_text_in_heading_markdown(text):
    """Apply ``extract_text_in_heading_markdown`` to every element of ``text``.

    Parameters
    ----------
    text : iterable
        Heading markdown strings, processed one at a time and in order.

    Returns
    -------
    list
        One ``extract_text_in_heading_markdown`` result per heading, in input
        order.
    """
    # The per-heading helper is not defined in this notebook; presumably it
    # arrives through the wildcard import from helper2 -- TODO confirm.
    return [extract_text_in_heading_markdown(item) for item in text]

In [15]:
# Run every section body through the markdown-snippet text extractor.
extracted_contents = generate_extract_text_from_markdown_snippet(df['section_contents'].to_list())

In [16]:
# Run every raw heading through the heading text extractor.
headings = generate_extract_text_in_heading_markdown(df['heading_markdown'].to_list())

In [17]:
# Store the extracted heading texts as a column on the shared frame.
df['headings'] = headings

In [21]:
# Apply the heading text extractor to the abstracted heading markdown as well.
df['abstract_headings'] = generate_extract_text_in_heading_markdown(df['abstract_heading_markdown'].to_list())

In [22]:
# Side-by-side view of each heading in its extracted, raw and abstracted forms.
df.loc[:, ['headings', 'heading_markdown', 'abstract_heading_markdown', 'abstract_headings']]

Unnamed: 0,headings,heading_markdown,abstract_heading_markdown,abstract_headings
0,Easy WeChat,# [Easy WeChat](http://easywechat.org),# @abstr_hyperlink \n,@abstr_hyperlink
1,特点,## 特点,## 特点\n,特点
2,环境要求,## 环境要求,## 环境要求\n,环境要求
3,安装,## 安装,## 安装\n,安装
4,使用,## 使用,## 使用\n,使用
...,...,...,...,...
4917,Testing,### Testing,### Testing\n,Testing
4918,Credit,### Credit,### Credit\n,Credit
4919,Discussion / Feedback / Issues / Bugs,### Discussion / Feedback / Issues / Bugs,### Discussion / Feedback / Issues / Bugs\n,Discussion / Feedback / Issues / Bugs
4920,#,##,# #\n,#


In [20]:
# Store the extracted section texts as a column on the shared frame.
# NOTE(review): this cell's execution count (20) is lower than that of the
# cells placed before it (21, 22), so it was run out of document order;
# verify the notebook still works with Restart & Run All.
df['extracted_contents'] = extracted_contents

In [25]:
# Run every section body through generate_abstract_text and store the results.
df['abstract_section_contents'] = generate_abstract_text(df['section_contents'].to_list())

In [26]:
# Side-by-side view of each section body in its extracted, raw and abstracted forms.
df.loc[:, ['extracted_contents', 'section_contents', 'abstract_section_contents']]

Unnamed: 0,extracted_contents,section_contents,abstract_section_contents
0,可能是目前最优雅的微信公众平台 SDK 了。 > 不支持企业号，也不打算支持，原因？微信的...,可能是目前最优雅的微信公众平台 SDK 了。 > 不支持企业号，也不打算支持，原因？微信...,可能是目前最优雅的微信公众平台 SDK 了。 > 不支持企业号，也不打算支持，原因？微信的A...
1,命名不那么乱七八糟； - 隐藏开发者不需要关注的细节； - 方法使用更优雅，不必再去研...,- 命名不那么乱七八糟； - 隐藏开发者不需要关注的细节； - 方法使用更优雅，不必...,* 命名不那么乱七八糟； - 隐藏开发者不需要关注的细节； - 方法使用更优雅，不必再去...
2,PHP >= 5.5.9 2. composer 3. openssl 拓展 4. fil...,1. PHP >= 5.5.9 2. **[composer](https://getco...,@abstr_number . PHP >= @abstr_number . @abstr_...
3,"shell composer require ""overtrue/wechat:~3.1"" ...","```shell composer require ""overtrue/wechat:~3...","`shell composer require ""overtrue/wechat:~ @ab..."
4,基本使用（以服务端为例）: php <?php use EasyWeChat\Found...,基本使用（以服务端为例）: ```php <?php use EasyWeChat\F...,基本使用（以服务端为例）: `php <?php use EasyWeChat\Founda...
...,...,...,...
4917,whenever-test is an extension to Whenever for ...,[whenever-test](https://github.com/heartbits/...,@abstr_hyperlink is an extension to Whenever f...
4918,Whenever was created for use at Inkling (http:...,Whenever was created for use at Inkling (<htt...,Whenever was created for use at Inkling ( @abs...
4919,"For general discussion and questions, please u...","For general discussion and questions, please ...","For general discussion and questions, please u..."
4920,----,---- [![Build Status](https://secure.travis-c...,\---- @abstr_hyperlink \n
