In [None]:
import sys
sys.path.append("..")

import random
import math
import itertools
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import Optional, Callable, List, Tuple, Iterable, Generator, Union, Dict

import PIL.Image
import PIL.ImageDraw
import plotly
import plotly.express as px
import plotly.graph_objects as go
plotly.io.templates.default = "plotly_dark"
import numpy as np
import pandas as pd
pd.options.plotting.backend = "plotly"

from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, IterableDataset, RandomSampler
import torchvision.transforms as VT
import torchvision.transforms.functional as VF
from torchvision.utils import make_grid
from IPython.display import display, Audio, HTML
import torchaudio
from torchaudio.io import StreamReader
import bs4

from src.datasets import *
from src.algo import GreedyLibrary
from src.util.image import *
from src.util import to_torch_device
from src.patchdb import PatchDB, PatchDBIndex
from src.models.encoder import *
from src.util.audio import *
from src.util.files import *

In [None]:
# NDJSON dump that `iter_source` reads from.
source_filename = Path(
    "~/prog/python/github/blog/src/audio/scrape_chords/e-chord-dump.ndjson"
).expanduser()


def iter_source():
    """Yield every entry of `source_filename`, with a tqdm progress bar around the iteration."""
    entries = iter_ndjson(source_filename)
    yield from tqdm(entries)

In [None]:
# Show one sample entry (index 23): first every field except "text",
# then the parsed HTML of the "text" field.
for i, entry in enumerate(iter_source()):
    if i != 23:
        continue

    for key, value in entry.items():
        if key == "text":
            continue
        print(f"{key:10}: {value}")

    # `soup` stays bound after the loop so that later cells can reuse it.
    soup = bs4.BeautifulSoup(entry["text"], features="html.parser")
    print(soup)
    break

In [None]:
def html_to_text(soup):
    """
    Flatten a parsed HTML tree into plain text.

    Text nodes are emitted in document order. The content of ``<u>`` elements is
    wrapped in ``*`` and the content of ``<i>`` elements in ``~``; any other
    element contributes only the text of its descendants.

    :param soup: root node of the tree to convert (e.g. a ``bs4.BeautifulSoup`` object)
    :return: str, the concatenated text with leading/trailing whitespace stripped
    """
    # tag name -> marker that is emitted before and after the tag's content
    wrappers = {"u": "*", "i": "~"}

    def _fragments(node):
        # Text nodes contribute their own text.
        if isinstance(node, bs4.element.NavigableString):
            yield node.text

        # Anything that exposes `children` is walked depth-first.
        if hasattr(node, "children"):
            marker = wrappers.get(getattr(node, "name", None))

            if marker is not None:
                yield marker

            for child in node.children:
                yield from _fragments(child)

            if marker is not None:
                yield marker

    return "".join(_fragments(soup)).strip()

# Preview the plain-text conversion of the sample entry parsed in the cell above.
converted_sample = html_to_text(soup)
print(converted_sample)

In [None]:
# Convert the HTML in the "text" field of every source entry to plain text
# and write the entries to ../datasets/echords.ndjson.gz.
with NDJson("../datasets/echords.ndjson.gz", "w") as fp:
    # The entry index is not needed here, so iterate over the entries directly.
    for entry in iter_source():
        soup = bs4.BeautifulSoup(entry["text"], features="html.parser")
        # Only "text" is replaced; all other fields are written unchanged.
        entry["text"] = html_to_text(soup)
        fp.write(entry)

In [None]:
!ls -l ../datasets/

underline

In [None]:
class EChordsIterableDataset(IterableDataset):
    """
    Iterable dataset over the entries of an e-chords NDJSON file.

    Each yielded item is a dict in which every ``None`` value has been replaced
    by an empty string and all carriage returns (``\\r``) have been removed from
    the ``"text"`` field.

    :param filename: optional path of the NDJSON file to read (``~`` is expanded).
        Defaults to ``DEFAULT_FILENAME``.
    :raises AssertionError: if the file does not exist.
    """

    # Location that is used when no `filename` is passed to the constructor.
    DEFAULT_FILENAME = "~/prog/python/github/nn-experiments/datasets/echords.ndjson.gz"

    def __init__(self, filename: Union[str, Path, None] = None):
        super().__init__()
        if filename is None:
            filename = self.DEFAULT_FILENAME
        self.filename = Path(filename).expanduser()
        assert self.filename.exists(), f"Did not find: {self.filename}"

    def __iter__(self):
        for data in iter_ndjson(self.filename):
            # Build a new dict rather than assigning into `data` while iterating over it.
            data = {
                key: "" if value is None else value
                for key, value in data.items()
            }
            data["text"] = data["text"].replace("\r", "")
            yield data


# Wrap the dataset in IterableShuffle and print the first entry it yields.
ds = IterableShuffle(EChordsIterableDataset(), 100)

for entry in ds:
    # All fields except "text" first, one per line ...
    for key, value in entry.items():
        if key == "text":
            continue
        print(f"{key:10}: {value}")

    # ... then the text itself.
    print(entry["text"])
    break

In [None]:
# Print every (key, value) pair whose value is None, across all entries of `ds`.
for entry in ds:
    none_items = [(key, value) for key, value in entry.items() if value is None]
    for key, value in none_items:
        print(key, value)