# Read corpus

Columns:
* sentence
* sentence # overall
* sentence # within doc
* sentence # within section
* sentence # within paragraph
* paragraph # overall
* paragraph # within section
* paragraph # within doc
* section title
* section level
* section # overall
* section # within doc
* doc title
* doc # overall

Separators:
* nltk to tokenize sentences within a paragraph
* paragraphs are separated by newlines (one paragraph per non-empty line)
* sections: "= = title = =" (or more " = ")
* articles: "= title ="

Separately, we'll make sentence embedding matrices,
whose rows will be matched to sentences by their overall sentence #.

In [72]:
import pandas as pd
import re
from copy import deepcopy
from pprint import pprint
from nltk.tokenize import sent_tokenize

1. Create a list of dictionaries in which each dictionary corresponds to an input data row.
2. Create a data frame from this list.

```
rows_list = []
for row in input_rows:

        dict1 = {}
        # get input row in dictionary format
        # key = col_name
        dict1.update(blah..) 

        rows_list.append(dict1)

df = pd.DataFrame(rows_list)   
```

(https://stackoverflow.com/a/17496530/6350352)

In [80]:
def new_sec(title, level, sentence_data):
    """Advance the running counters for a new document or section heading.

    The dict is updated in place and also returned.

    Args:
        title: heading text.
        level: 0 starts a new document; any other value starts a new
            section inside the current document.
        sentence_data: running state dict (see ``init_sentence_data``).

    Returns:
        The same ``sentence_data`` object, updated.
    """
    sentence_data["sec_level"] = level
    sentence_data["sec_id"] += 1
    # Every heading opens a new section (sec_id is bumped in both branches,
    # and a document title opens that document's untitled lead section), so
    # the within-section counters must restart in both branches. Previously
    # they were only reset for level != 0, which let the lead section of a
    # new document continue counting from the previous document's last section.
    sentence_data["s_in_sec"] = -1
    sentence_data["par_in_sec"] = -1
    if level == 0:
        # new doc
        sentence_data["doc_title"] = title
        sentence_data["sec_title"] = ""
        sentence_data["doc_id"] += 1
        sentence_data["s_in_doc"] = -1
        sentence_data["par_in_doc"] = -1
        sentence_data["sec_in_doc"] = 0
    else:
        # new sec, same doc
        sentence_data["sec_title"] = title
        sentence_data["sec_in_doc"] += 1
    return sentence_data

def new_par(sentence_data):
    """Register the start of a new paragraph in the running state.

    Bumps the overall, within-section and within-document paragraph
    counters and restarts the within-paragraph sentence counter. The dict
    is modified in place and returned.
    """
    for counter in ("par_id", "par_in_sec", "par_in_doc"):
        sentence_data[counter] += 1
    # -1 so that the next sentence registered becomes index 0 of this paragraph
    sentence_data["s_in_par"] = -1
    return sentence_data

def new_s(sentence_data, sentence=None):
    """Record a new sentence in the running state and bump sentence counters.

    The dict is updated in place and also returned.

    Args:
        sentence_data: running state dict (see ``init_sentence_data``).
        sentence: the sentence text. Optional only for backward
            compatibility: when omitted, the module-level variable named
            ``sentence`` is used, which is what this function originally
            relied on implicitly.

    Returns:
        The same ``sentence_data`` object, updated.

    Raises:
        NameError: if ``sentence`` is omitted and no module-level
            ``sentence`` variable exists.
    """
    if sentence is None:
        # Legacy call style new_s(sentence_data): read the caller's global.
        try:
            sentence = globals()["sentence"]
        except KeyError:
            raise NameError(
                "name 'sentence' is not defined; "
                "call new_s(sentence_data, sentence)"
            ) from None
    sentence_data["sentence"] = sentence
    sentence_data["s_id"] += 1
    sentence_data["s_in_par"] += 1
    sentence_data["s_in_sec"] += 1
    sentence_data["s_in_doc"] += 1
    return sentence_data

def init_sentence_data():
    """Build the initial running-state dict for the corpus reader.

    Text fields start as empty strings and every counter starts at -1, so
    that the first increment yields index 0. Keys are inserted in a fixed
    order: sentence, sentence IDs, paragraph IDs, section IDs, document IDs.
    """
    text_fields = ("sentence", "sec_title", "doc_title")
    field_order = (
        # sentence string
        "sentence",
        # sentence IDs
        "s_id", "s_in_doc", "s_in_sec", "s_in_par",
        # paragraph IDs
        "par_id", "par_in_doc", "par_in_sec",
        # section IDs
        "sec_title", "sec_level", "sec_id", "sec_in_doc",
        # document IDs
        "doc_title", "doc_id",
    )
    return {
        field: ("" if field in text_fields else -1)
        for field in field_order
    }

In [100]:
# filenames
# raw corpus that the parsing loop reads line by line
input_data_file = "data/wiki/wiki.train.raw"

# CSV that the per-sentence data frame is written to
output_data_file = "data/wiki/wiki_reformatted.csv"
# later: wiki_skipthoughts, wiki_dissent, etc.

In [97]:
# initialize sentence data
sentences_list = []
level_signifier = " ="
sentence_data = init_sentence_data()
# recognize titles of articles and sections:
#   group 1 = title text, group 2 = trailing run of " =" markers
title_pattern = r"=(?:%s)* (.*?)((?:%s)+)$" %(level_signifier, level_signifier)
# read through wikipedia file; the context manager guarantees the handle is
# closed (the bare open() in the for-statement never closed it).
# NOTE(review): encoding is assumed to be UTF-8 for this corpus — confirm.
with open(input_data_file, "r", encoding="utf-8") as corpus_file:
    for line in corpus_file:
        line = line.strip()
        title_match = re.match(title_pattern, line)
        if title_match:
            title, trailing_markers = title_match.groups()
            # level based on number of trailing equals signs:
            # one marker -> level 0 (article), two -> level 1, ...
            level = trailing_markers.count(level_signifier) - 1
            sentence_data = new_sec(title, level, sentence_data)
        elif line != "":
            # new paragraph
            sentence_data = new_par(sentence_data)
            # use nltk to tokenize into sentences
            sentences = sent_tokenize(line)
            # the loop variable must be called `sentence`: new_s picks the
            # text up from the module-level name of that spelling
            for sentence in sentences:
                sentence_data = new_s(sentence_data)
                # snapshot the running state; every value is a str or int,
                # so a shallow copy is equivalent to (and far cheaper than)
                # a deepcopy
                sentences_list.append(sentence_data.copy())

CPU times: user 9.45 s, sys: 24 ms, total: 9.47 s
Wall time: 12.5 s


In [98]:
# one row per sentence snapshot collected in sentences_list
df = pd.DataFrame(sentences_list)

CPU times: user 632 ms, sys: 96 ms, total: 728 ms
Wall time: 1.08 s


In [101]:
# write the reformatted corpus to disk; with to_csv's defaults the frame's
# index is written as well, as the first column
df.to_csv(output_data_file)

CPU times: user 2.45 s, sys: 40 ms, total: 2.49 s
Wall time: 2.92 s


```
jupyter nbconvert --to script read_data.ipynb
```