In [1]:
import os
import os.path as op
from collections import OrderedDict
import nbformat as nbf
from glob import glob
from tqdm import tqdm

import numpy as np

In [2]:
import shutil as sh

In [3]:
# Source files of the original textbook, discovered recursively.
ipynb_files = glob('../textbook/notebooks/**/*.ipynb', recursive=True)
md_files = glob('../textbook/chapters/**/*.md', recursive=True)
# Data files read by the notebooks. Some notebooks read gzip-compressed
# tables (`*.csv.gz`), which the plain `*.csv` pattern does not match, so both
# patterns are collected to make sure every referenced data file is found.
csv_files = (glob('../textbook/notebooks/**/*.csv', recursive=True)
             + glob('../textbook/notebooks/**/*.csv.gz', recursive=True))

In [4]:
def _between_symbols(string, c1, c2):
    """Will return empty string if nothing is between c1 and c2."""
    for char in [c1, c2]:
        if char not in string:
            raise ValueError("Couldn't find charachter {} in string {}".format(
                char, string))
    return string[string.index(c1)+1:string.index(c2)]

* Loop through the markdown files.
    * If there is no *include* statement, just copy over the file.
    * If there is an include statement, copy over the relevant ipynb file instead.
        * In this case, loop over the cells in the ipynb file and look for references to `.csv` files.
            * If one is found, replace the path to that file with the relative path to the site's `root/data` folder.

In [5]:
# Root directory of the generated site; the `notebooks` and `data` folders
# written by the conversion cell below are created underneath it.
new_root = '.'

In [25]:
from nbconvert.preprocessors import ExecutePreprocessor


In [29]:
# Convert each chapter of the old textbook into a page of the new site.
# A chapter that embeds a rendered notebook through a `{% include ... %}` tag
# is replaced by a copy of that notebook; any other chapter is copied over as
# markdown with its image links made relative.

# (markdown basename, notebook basename) for every chapter that was replaced
# by a notebook.
notebook_include_names = []
for ifile in tqdm(md_files):
    # Defining paths
    old_path_file = ifile.split('chapters/')[-1]
    old_path_folder = os.path.dirname(old_path_file)
    new_path_folder = os.path.join(new_root, 'notebooks', old_path_folder)
    path_rel_root = op.relpath(new_root, new_path_folder)
    path_data = op.join(path_rel_root, 'data')

    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(new_path_folder, exist_ok=True)

    with open(ifile, 'r') as ff:
        md_lines = ff.readlines()

    if not any('{% include' in line for line in md_lines):
        # Markdown file, just copy it over, pointing site-absolute image
        # links at the images folder relative to this page.
        new_path_file = op.join(new_path_folder, op.basename(old_path_file))
        md_lines = [ii.replace('(/images', op.join('(' + path_rel_root, 'images'))
                    for ii in md_lines]
        with open(new_path_file, 'w') as ff:
            ff.writelines(md_lines)
        continue

    # Copy over the notebook to the new location. If a chapter has several
    # include tags, the last one wins.
    for line in md_lines:
        if '{% include' in line:
            include_link = _between_symbols(line, '{', '}')
            include_link = include_link.replace('% include "', '')
            include_link = include_link.replace('" %', '')
            include_link = include_link.split('notebooks-html')[-1].strip('/')
            include_link_nb = include_link.replace('.html', '.ipynb')
            # new_path_folder already starts with new_root; joining new_root
            # a second time would duplicate it for any root other than '.'.
            new_nb_path = op.join(new_path_folder, include_link_nb)
    # Save the MD/NB file for the summary
    notebook_include_names.append((op.basename(ifile), op.basename(new_nb_path)))
    # The include path may itself contain sub-folders; make sure they exist.
    os.makedirs(op.dirname(new_nb_path), exist_ok=True)
    sh.copy2(op.join('../textbook/notebooks', include_link_nb),
             new_nb_path)

    # Replace read_table data files with relative path for data
    ntbk = nbf.read(new_nb_path, nbf.NO_CONVERT)
    for cell in ntbk.cells:
        if cell.cell_type != 'code':
            continue
        src_lines = cell.source.split('\n')
        for ii, src_line in enumerate(src_lines):
            # Remote files keep their URL; lines without a csv are untouched.
            if 'http' in src_line or '.csv' not in src_line:
                continue
            if '(' not in src_line:
                # Assume it's just data = XXX
                parts = src_line.split("=")
                parts[-1] = ' path_data +' + parts[-1]
                src_lines[ii] = '='.join(parts)
            else:
                # Take everything between the last "(" before ".csv" and the
                # extension itself, i.e. the opening quote plus the file name.
                filename = src_line.split('.csv')[0]
                filename = filename.split("(")[-1] + '.csv'
                new_name = "path_data + " + filename
                src_lines[ii] = src_line.replace(filename, new_name)
        cell.source = '\n'.join(src_lines)

    # Update first cell to define the path_data variable
    # NOTE(review): a notebook without a '# HIDDEN' cell still gets its csv
    # paths rewritten but no `path_data` definition -- confirm every included
    # notebook has such a cell.
    meta = [cell for cell in ntbk.cells if '# HIDDEN' in cell.source]
    if len(meta) > 0:
        hidden_lines = meta[0].source.split('\n')
        hidden_lines.insert(3, "path_data = '{}/'".format(path_data))
        meta[0].source = '\n'.join(hidden_lines)
    nbf.write(ntbk, new_nb_path)
        
        
# Copy data: gather every data file into one flat `data` folder under the
# site root (files sharing a basename overwrite each other).
path_data = op.join(new_root, 'data')
os.makedirs(path_data, exist_ok=True)
for icsv in csv_files:
    sh.copy2(icsv, op.join(path_data, op.basename(icsv)))
    
# Copy over summary file, pointing its links at the converted pages.
with open('../textbook/SUMMARY.md', 'r') as ff:
    text = ff.read()

text = text.replace('chapters/', 'notebooks/')
for imd, inb in notebook_include_names:
    # Chapters that were replaced by a notebook must link to the notebook.
    # Markdown links always use '/', whatever the host OS separator is.
    text = text.replace('/' + imd, '/' + inb)

# Written only after the input file has been closed, under the site root.
with open(op.join(new_root, 'SUMMARY.md'), 'w') as ff:
    ff.write(text)
        
# Copy over readme file as the site's intro page, prefixed with the YAML
# front matter below.
add_lines = ["---",
             "author_profile: false",
             "layout: textbook",
             "permalink: /",
             "sidebar:",
             "    nav: sidebar-textbook",
             "---"]
add_lines = [ii+'\n' for ii in add_lines]
with open('../textbook/README.md', 'r') as ff:
    text = ff.readlines()

# Front matter, a blank separator, then the original readme content.
text = add_lines + ['\n\n'] + text

# Written only after the input file has been closed, under the site root.
with open(op.join(new_root, 'intro.md'), 'w') as ff:
    ff.writelines(text)
        
# Copy images (top-level png/jpg files only) into the site's `images` folder.
path_images = op.join(new_root, 'images')
# Unlike the data folder, the images folder was never created, so the first
# copy failed on a fresh output tree.
os.makedirs(path_images, exist_ok=True)
image_files = glob('../textbook/images/*.png') + glob('../textbook/images/*.jpg')
for img in image_files:
    name = op.basename(img)
    sh.copy2(img, op.join(path_images, name))

100%|██████████| 95/95 [00:03<00:00, 24.91it/s]


## Scratch

In [25]:
# Print a short context window around the first occurrence of `search_text`
# in every code cell of every source notebook.
search_text = '.csv'

for ifile in tqdm(ipynb_files):
    ntbk = nbf.read(ifile, nbf.NO_CONVERT)

    for cell in ntbk.cells:
        if cell.cell_type != 'code' or search_text not in cell.source:
            continue
        text = cell.source
        ix_text = text.find(search_text)
        # Clamp the window start at 0: a negative start would count from the
        # end of the string and print the wrong (usually empty) slice when
        # the match is within the first 20 characters of the cell.
        print(text[max(ix_text - 20, 0):ix_text + 10])

  5%|▌         | 9/173 [00:00<00:04, 37.80it/s]

ble.read_table('baby.csv')
bab
ble.read_table('baby.csv')
bab
ble.read_table('wine.csv')

# 
table('breast-cancer.csv').dro
e.read_table('galton.csv')
gal
able('roulette_wheel.csv').col


  9%|▊         | 15/173 [00:00<00:05, 28.06it/s]

ble.read_table('trip.csv')
tri
.read_table('station.csv')
sta
('san_francisco_2015.csv')
ble.read_table('baby.csv')


 10%|█         | 18/173 [00:00<00:05, 26.94it/s]

table('breast-cancer.csv').dro
C-EST2014-AGESEX-RES.csv'
full
able.read_table('bta.csv')
bta
e("observed_outcomes.csv")
obs
able('airline_ontime.csv')
ont


 14%|█▍        | 25/173 [00:00<00:05, 27.32it/s]

able('roulette_wheel.csv').col
e('united_summer2015.csv')
r_ethnicity_everyone.csv')
chi
ble.read_table('baby.csv')
bab
able.read_table('ckd.csv').rel
read_table('banknote.csv')
ban
table('breast-cancer.csv').dro


 16%|█▌        | 28/173 [00:01<00:05, 27.48it/s]

able.read_table('ckd.csv').rel
able.read_table('ckd.csv').rel
read_table('banknote.csv')
ban
table('breast-cancer.csv').dro
able.read_table('ckd.csv').rel
read_table('banknote.csv')
ban
table('breast-cancer.csv').dro
ble.read_table('wine.csv')


 20%|██        | 35/173 [00:01<00:05, 25.68it/s]

_table('nba_salaries.csv')
nba
ble.read_table('imdb.csv')
box
_table('income_small.csv')
CA_


 22%|██▏       | 38/173 [00:01<00:05, 25.09it/s]

ad_table('birth_time.csv').sel
ad_table('birth_time.csv').dro
ble.read_table("trip.csv").whe
ble.read_table('baby.csv')


 25%|██▌       | 44/173 [00:01<00:05, 22.39it/s]

e.read_table('hybrid.csv')
.read_table('sat2014.csv').sor
ad_table('hybrid_reg.csv') # h
.read_table('sat2014.csv').sor
read_table('educ_inc.csv')
ca_
e('scores_by_section.csv')
sco
d_table('deflategate.csv')
foo


 29%|██▉       | 50/173 [00:02<00:05, 22.67it/s]

e('united_summer2015.csv')
ble.read_table('baby.csv')
e('united_summer2015.csv')
_table('little_women.csv')
lit
read_table('faithful.csv')
fai
.read_table('sat2014.csv')
sat
e('scores_by_section.csv')
sec


 35%|███▌      | 61/173 [00:02<00:04, 24.60it/s]

read_table('football.csv')
foo
c-est2015-agesex-res.csv'

# A
e.read_table('galton.csv')
c-est2015-agesex-res.csv'

# A
Table.read_table('IV.csv').dro
ad_table('treecover2.csv.gz', 


 40%|███▉      | 69/173 [00:02<00:04, 24.69it/s]

ble.read_table('baby.csv')
bab
table('breast-cancer.csv').dro
ble.read_table('wine.csv')
read_table('banknote.csv')
ban
ble.read_table('wine.csv')

# 
ble.read_table('baby.csv')


 45%|████▍     | 77/173 [00:03<00:03, 25.51it/s]

le.read_table('cones.csv')
nba
ble.read_table('trip.csv')
tri
.read_table('station.csv')
sta
.read_table('shotput.csv')


 49%|████▉     | 85/173 [00:03<00:03, 25.52it/s]

_table('little_women.csv')
lit
ad_table('treecover2.csv.gz', 
e('grades_and_piazza.csv')
gra


 51%|█████     | 88/173 [00:03<00:03, 25.32it/s]

le.read_table('birds.csv')
bir
= pd.read_csv('birds.csv')
df_
le.read_table('house.csv')
sal
able.read_table('ckd.csv').rel


 54%|█████▍    | 94/173 [00:03<00:03, 25.29it/s]

able.read_table('ckd.csv').rel
read_table('banknote.csv')
ban
able('airline_ontime.csv')
ua 
ble.read_table('baby.csv')
bir
e.read_table('galton.csv')
hei
table('galton_subset.csv')
hei
c-est2015-agesex-res.csv'
full
d_table('usa_ca_2014.csv')
usa


 57%|█████▋    | 98/173 [00:03<00:02, 25.12it/s]

e('scores_by_section.csv')
sco
.read_table('couples.csv').sel
read_table('football.csv')
foo
e.read_table('galton.csv')
hei
ble.read_table('baby.csv')


 61%|██████    | 105/173 [00:04<00:02, 25.27it/s]

read_table('hodgkins.csv')
hod
ad_table('./all-lprs.csv.gz', 
('san_francisco_2015.csv').whe


 65%|██████▍   | 112/173 [00:04<00:02, 25.02it/s]

.read_table('heights.csv')
hei
ble.read_table('baby.csv')
hyb
read_table('us_women.csv')
cor
ble.read_table('baby.csv')
e.read_table('galton.csv')

he
ble.read_table('baby.csv')


 69%|██████▉   | 119/173 [00:04<00:02, 25.15it/s]

ble.read_table('baby.csv')
hyb
read_table('us_women.csv')
cor
able.read_table('ckd.csv').rel
ad_table('top_movies.csv')
top
ad_table('top_movies.csv')
top
e('united_summer2015.csv')
uni
ad_table('top_movies.csv')
top
ble.read_table('baby.csv')


 73%|███████▎  | 126/173 [00:04<00:01, 25.29it/s]

_table('nba_salaries.csv')
nba
_table('nba_salaries.csv')
nba
able('airline_ontime.csv')
ua 
.read_table("nba2013.csv")
nba


 76%|███████▌  | 131/173 [00:05<00:01, 25.73it/s]

ble('married_couples.csv').sel
e.read_table('minard.csv')
min
e('scores_by_section.csv')
sco
able.read_table('ckd.csv').rel


 82%|████████▏ | 141/173 [00:05<00:01, 26.37it/s]

table('breast-cancer.csv').dro
ble.read_table('baby.csv')
read_table('hodgkins.csv')
.read_table('nba2013.csv')
nba
e('united_summer2015.csv')
uni
e('united_summer2015.csv')
del
e.read_table('actors.csv')
act
able('movies_by_year.csv')
mov


 84%|████████▍ | 145/173 [00:05<00:01, 26.22it/s]

ad_table('top_movies.csv')
top
ad_table('top_movies.csv')
# M
e.read_table('galton.csv')
hei


 91%|█████████▏| 158/173 [00:05<00:00, 27.42it/s]

le.read_table('cones.csv')
nba


 97%|█████████▋| 168/173 [00:06<00:00, 27.91it/s]

ad_table('top_movies.csv')
top
e('united_summer2015.csv')
uni
e('united_summer2015.csv')


100%|██████████| 173/173 [00:06<00:00, 27.84it/s]

e('scores_by_section.csv')
sco
ble.read_table('baby.csv')
bab
d_table('deflategate.csv')
foo
able.read_table('bta.csv')
bta
e("observed_outcomes.csv")
obs



