In [5]:
import rich
from loguru import logger
from collections import defaultdict
from unidecode import unidecode
import string
from readme2book import ReadmeParser, Stub

In [4]:
# NOTE(review): versions are unpinned; the recorded output shows Unidecode 1.3.6 being installed.
# This cell's execution count (4) is lower than that of the import cell above it (5), so on a
# fresh environment it has to be run before those imports can succeed.
%pip install loguru unidecode

Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6
Note: you may need to restart the kernel to use updated packages.


In [1]:
from collections import defaultdict
import json
from readme2book import ReadmeParser


def collect_docs(parser):
    """Flatten the parser's nested entries into a dict of records keyed by stub name.

    Every entry in ``parser.entries`` that has a non-empty ``'subheadings'``
    mapping contributes the record lists stored under each subheading. A record
    whose ``'tags'`` value is missing or empty is tagged **in place** with
    ``[heading, subheading]``, so the parser's own records are modified too.

    Args:
        parser: object exposing ``entries``, an iterable of dicts with a
            ``'heading'`` key and, optionally, a ``'subheadings'`` mapping of
            subheading -> list of records.

    Returns:
        dict: ``stub_name -> record`` for every dict record with a truthy
        ``'stub_name'``. Records without one are omitted. If several records
        share a stub name, the last one encountered wins.
    """
    items = []
    for entry in parser.entries:
        subheadings = entry.get('subheadings')
        if not subheadings:  # heading with nothing parsed beneath it
            continue
        heading = entry['heading']
        for subheading, records in subheadings.items():
            for rec in records:
                # Anything that is not a dict (e.g. a raw, unparsed line) has no
                # fields to index, so skip it rather than crash on .get().
                if not isinstance(rec, dict):
                    continue
                if not rec.get('tags'):
                    # Fresh list per record so records never share one tags object.
                    rec['tags'] = [heading, subheading]
                items.append(rec)
    return {rec['stub_name']: rec for rec in items if rec.get('stub_name')}


def aggregate(docs, key):
    """Build an inverted index from the values stored under ``key`` to document ids.

    Args:
        docs: mapping of document id -> record (a dict that contains ``key``).
        key: field to index. A list value contributes one index key per
            element; any other value is used as a single index key.

    Returns:
        dict: index key -> list of document ids (in ``docs`` iteration order),
        with the index keys in sorted order.
    """
    index = defaultdict(list)
    for doc_id, doc in docs.items():
        value = doc[key]
        # Treat a scalar as a one-element list so both shapes share one code path.
        members = value if isinstance(value, list) else [value]
        for member in members:
            index[member].append(doc_id)
    return {member: doc_ids for member, doc_ids in sorted(index.items())}


# Parse the README and index the records that have a stub name.
parser = ReadmeParser()
docs = collect_docs(parser)

# Inverted indexes: field value -> list of stub names.
authors = aggregate(docs, 'authors')
tags = aggregate(docs, 'tags')
years = aggregate(docs, 'year')

# Records that have no truthy 'stub_name' -- exactly the ones collect_docs leaves out of `docs`.
partial_parses = []
for entry in parser.entries:
    if 'subheadings' in entry:
        for subheading, item in entry['subheadings'].items():
            for rec in item:
                if not rec.get('stub_name'):
                    partial_parses.append(rec)

# Everything that gets persisted, including what failed to parse.
db = {
    'docs': docs,
    'authors': authors,
    'tags': tags,
    'years': years,
    'errors': {
        'parser_errors': parser.errors,
        'partial_parses': partial_parses,
    },
}

with open("db.json", 'w') as f:
    json.dump(db, f)




In [2]:
# Read the database back from disk to check what was written.
# NOTE(review): despite its name, `outstr` holds the decoded JSON object (a dict here), not a string.
with open("db.json", 'r') as f:
    outstr=json.load(f)
outstr

{'docs': {'1996_RobertTibshirani_RegressionShrinkageAndSelectio': {'year': 1996,
   'title_part': '"Regression Shrinkage and Selection via the Lasso"',
   'url': 'https://statweb.stanford.edu/~tibs/lasso/lasso.pdf',
   'is_pdf': 'True',
   'authors': ['Robert Tibshirani'],
   'stub_name': '1996_RobertTibshirani_RegressionShrinkageAndSelectio',
   'tags': ['"Classic" ML', 'Lasso/elasticnet']},
  '2005_HuiZou_RegularizationAndVariableSelec': {'year': 2005,
   'title_part': '"Regularization and variable selection via the elastic net"',
   'url': 'https://web.stanford.edu/~hastie/Papers/B67.2%20(2005)%20301-320%20Zou%20&%20Hastie.pdf',
   'is_pdf': 'True',
   'authors': ['Hui Zou', 'Trevor Hastie'],
   'stub_name': '2005_HuiZou_RegularizationAndVariableSelec',
   'tags': ['"Classic" ML', 'Lasso/elasticnet']},
  '1990_RobertESchapire_TheStrengthOfWeakLearnability': {'year': 1990,
   'title_part': '"The Strength of Weak Learnability"',
   'url': 'http://rob.schapire.net/papers/strengthofweak

In [10]:
import json

# NOTE(review): this cell repeats the partial-parse collection and db.json dump done in an
# earlier cell; it relies on `parser`, `docs`, `authors`, `tags` and `years` already existing.

# Records that have no truthy 'stub_name'.
partial_parses = []
for entry in parser.entries:
    if 'subheadings' in entry:
        for subheading, item in entry['subheadings'].items():
            for rec in item:
                if not rec.get('stub_name'):
                    partial_parses.append(rec)

db = {
    'docs': docs,
    'authors': authors,
    'tags': tags,
    'years': years,
    'errors': {
        'parser_errors': parser.errors,
        'partial_parses': partial_parses,
    },
}

with open("db.json", 'w') as f:
    json.dump(db, f)

In [10]:
# Display the lines collected in parser.errors (presumably README lines the parser could not handle).
parser.errors

['* https://cseweb.ucsd.edu/~dasgupta/papers/randomf.pdf',
 '* See also Johnson-Lindenstrauss lemma',
 '* 1938 - "The Law of Anomalous Numbers" - Frank Benford',
 '* 1881 -  "Note on the frequency of use of the different digits in natural numbers" - Simon Newcomb',
 '* 1966 - "The Scree Test For The Number Of Factors" - Raymond B. Cattell',
 '* Uh... here there be dragons. Maybe just leave some breadcrumbs here?',
 '* Probably discussed sufficiently in the Adam paper',
 '* see backprop',
 '* See also AlexNet',
 '* see also Alex Graves 2013',
 '## Computer Vision / representation learning',
 '## NLP',
 '* ULM',
 '* GPT-2 / GPT-3',
 '## Representation Learning',
 '* see also knowledge distillation below',
 '## Misc',
 '* 1998 - "On Measuring and Correcting the Effects of Data Mining and Model Selection" - Jianming Ye',
 '* 1943 - "On the stability of inverse problems" - L2 regularization introduced by Tikhonov, original paper in Russian',
 '* 1984 - "Extensions of Lipschitz mappings into

In [13]:
# NOTE(review): another copy of the db.json dump; it depends on `db` having been built by an earlier cell.
import json

# Overwrites db.json with the current contents of `db`.
with open("db.json", 'w') as f:
    json.dump(db, f)

In [11]:
# Other parsing errors: records that were parsed into a dict but have no truthy 'stub_name'.

partial_parses = []
for entry in parser.entries:
    if 'subheadings' in entry:
        for subheading, item in entry['subheadings'].items():
            for rec in item:
                if not rec.get('stub_name'):
                    partial_parses.append(rec)

partial_parses

[{'rest': [' "The Law of Anomalous Numbers" ', ' Frank Benford'],
  'year': 1938,
  'tags': ['"Classic" ML', "Benford's Law"]},
 {'rest': ['  "Note on the frequency of use of the different digits in natural numbers" ',
   ' Simon Newcomb'],
  'year': 1881,
  'tags': ['"Classic" ML', "Benford's Law"]},
 {'rest': [' "The Scree Test For The Number Of Factors" ',
   ' Raymond B. Cattell'],
  'year': 1966,
  'tags': ['"Classic" ML', 'Scree plot']},
 {'rest': [' "On Measuring and Correcting the Effects of Data Mining and Model Selection" ',
   ' Jianming Ye'],
  'year': 1998,
  'tags': ['Learning theory / Deep learning theory / model compression / interpretability / Information Geometry',
   'generalized degrees of freedom']},
 {'rest': [' "On the stability of inverse problems" ',
   ' L2 regularization introduced by Tikhonov, original paper in Russian'],
  'year': 1943,
  'tags': ['Learning theory / Deep learning theory / model compression / interpretability / Information Geometry',
   'L1/

In [28]:
# NOTE(review): `item` is not assigned in this cell; it is whatever an earlier cell's loop
# left bound, so the value shown depends on which cells ran before this one.
item

[{'year': 2021,
  'title_part': 'Classifier-Free Diffusion Guidance',
  'url': 'https://openreview.net/forum?id=qw8AKxfYbI',
  'authors': ['Jonathan Ho', 'Tim Salimans'],
  'stub_name': '2021_JonathanHo_ClassifierFreeDiffusionGuidanc',
  'tags': ['Misc important papers for generative models/art, misc modern era',
   'Classifier-free Guidance (CFG)']}]

In [12]:
# List each top-level heading with the keys of its entry, to spot entries lacking 'subheadings'.
for entry in parser.entries:
    print(entry['heading'], entry.keys())

"Classic" ML dict_keys(['heading', 'subheadings'])
Network Graphs / combinatorial optimization dict_keys(['heading', 'subheadings'])
Geometric Deep Learning and ML applications of group theory/representation theory dict_keys(['heading', 'subheadings'])
Misc optimization and numerical methods dict_keys(['heading', 'subheadings'])
Neural optimizers dict_keys(['heading', 'subheadings'])
Neural activations dict_keys(['heading', 'subheadings'])
Neural initializations dict_keys(['heading', 'subheadings'])
Neural layers dict_keys(['heading', 'subheadings'])
RL dict_keys(['heading', 'subheadings'])
Hyperparameter tuning / Architecture Search dict_keys(['heading', 'subheadings'])
Implicit Representation dict_keys(['heading', 'subheadings'])
Specific architectures/achievements, and other misc milestones dict_keys(['heading', 'subheadings'])
Learning theory / Deep learning theory / model compression / interpretability / Information Geometry dict_keys(['heading', 'subheadings'])
Information theory

In [None]:
# Build a Stub for every record under every subheading.
# Each subheading maps to a *list* of records, while Stub is constructed from a single
# record elsewhere in this notebook, so iterate the list instead of passing it whole.
# NOTE(review): records lacking 'url'/'stub_name' are passed through as well -- confirm
# that Stub tolerates them, or filter them out first.
for entry in parser.entries:
    if 'subheadings' not in entry:
        continue
    for item in entry['subheadings'].values():
        for rec in item:
            stub = Stub(rec)

In [15]:
# Run readme2book.py as a script in a subprocess; the recorded output is its DEBUG log.
!python readme2book.py

[32m2021-10-28 20:07:34.627[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m241[0m - [34m[1msubheading: Lasso/elasticnet[0m
[32m2021-10-28 20:07:34.627[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m243[0m - [34m[1mitem:
 {'year': 1996, 'title_part': '"Regression Shrinkage and Selection via the Lasso"', 'url': 'https://statweb.stanford.edu/~tibs/lasso/lasso.pdf', 'is_pdf': 'True', 'authors': ['Robert Tibshirani'], 'stub_name': '1996_RobertTibshirani_RegressionShrinkageAndSelectio'}[0m
[32m2021-10-28 20:07:34.627[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m243[0m - [34m[1mitem:
 {'year': 2005, 'title_part': '"Regularization and variable selection via the elastic net"', 'url': 'https://web.stanford.edu/~hastie/Papers/B67.2%20(2005)%20301-320%20Zou%20&%20Hastie.pdf', 'is_pdf': 'True', 'authors': ['Hui Zou', 'Trevor Hastie'], 'stub_name': '2005_HuiZou_RegularizationAndVariableSelec'}[0m
[32m2021-10-28

[32m2021-10-28 20:07:34.642[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m241[0m - [34m[1msubheading: UMAP[0m
[32m2021-10-28 20:07:34.642[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m243[0m - [34m[1mitem:
 {'year': 2018, 'title_part': '"UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction"', 'url': 'https://arxiv.org/abs/1802.03426', 'authors': ['Leland McInnes', 'John Healy', 'James Melville'], 'stub_name': '2018_LelandMcinnes_UmapUniformManifoldApproximati'}[0m
[32m2021-10-28 20:07:34.642[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m241[0m - [34m[1msubheading: LSH[0m
[32m2021-10-28 20:07:34.643[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m243[0m - [34m[1mitem:
 {'year': 2014, 'title_part': '"LOCALITY PRESERVING HASHING"', 'url': 'https://faculty.ucmerced.edu/mhyang/papers/icip14_lph.pdf', 'is_pdf': 'True', 'authors': ['Yi-Hsuan Tsai', 'Ming-

[32m2021-10-28 20:07:34.684[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m241[0m - [34m[1msubheading: GPT-3[0m
[32m2021-10-28 20:07:34.684[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m243[0m - [34m[1mitem:
 {'year': 2020, 'title_part': '"Language Models are Few-Shot Learners"', 'url': 'https://arxiv.org/abs/2005.14165', 'authors': ['Tom B. Brown', 'Benjamin Mann', 'Nick Ryder', 'Melanie Subbiah', 'Jared Kaplan', 'Prafulla Dhariwal', 'Arvind Neelakantan', 'Pranav Shyam', 'Girish Sastry', 'Amanda Askell', 'Sandhini Agarwal', 'Ariel Herbert-Voss', 'Gretchen Krueger', 'Tom Henighan', 'Rewon Child', 'Aditya Ramesh', 'Daniel M. Ziegler', 'Jeffrey Wu', 'Clemens Winter', 'Christopher Hesse', 'Mark Chen', 'Eric Sigler', 'Mateusz Litwin', 'Scott Gray', 'Benjamin Chess', 'Jack Clark', 'Christopher Berner', 'Sam McCandlish', 'Alec Radford', 'Ilya Sutskever', 'Dario Amodei'], 'stub_name': '2020_TomBBrown_LanguageModelsAreFewShotLearne'}[0

In [5]:
# Report every record that is not a dict, and every dict record that has no 'url' key.
for entry in parser.entries:
    if 'subheadings' in entry:
        for recs in entry['subheadings'].values():
            for rec in recs:
                if not isinstance(rec, dict):
                    print("non-dict record")
                    print(rec)
                elif 'url' not in rec:
                    print("no url")
                    print(rec)

no url
{'rest': [' "The Law of Anomalous Numbers" ', ' Frank Benford'], 'year': 1938}
no url
{'rest': ['  "Note on the frequency of use of the different digits in natural numbers" ', ' Simon Newcomb'], 'year': 1881}
no url
{'rest': [' "The Scree Test For The Number Of Factors" ', ' Raymond B. Cattell'], 'year': 1966}
no url
{'rest': [' "On Measuring and Correcting the Effects of Data Mining and Model Selection" ', ' Jianming Ye'], 'year': 1998}
no url
{'rest': [' "On the stability of inverse problems" ', ' L2 regularization introduced by Tikhonov, original paper in Russian'], 'year': 1943}
no url
{'rest': [' "Extensions of Lipschitz mappings into a Hilbert space" ', ' William B. Johnson, Joram Lindenstrauss'], 'year': 1984}
no url
{'rest': [' "Extensions of Lipschitz mappings into a Hilbert space" ', ' William B. Johnson, Joram Lindenstrauss'], 'year': 1984}


In [6]:
# NOTE(review): `entry` is not assigned in this cell; presumably it is the value left bound
# by the last iteration of an earlier loop over parser.entries.
entry
# Guess: the issue is that a subheading can hold several records,
# while the parser currently assumes there is just one?

{'heading': 'Time series forecasting',
 'subheadings': defaultdict(list,
             {'RNN forecasting': [{'year': 1991,
                'title_part': '"Recurrent Networks and NARMA Modeling"',
                'url': 'https://proceedings.neurips.cc/paper/1991/file/5ef0b4eba35ab2d6180b0bca7e46b6f9-Paper.pdf',
                'is_pdf': 'True',
                'authors': ['J. Connor', 'L. Atlas', 'R. Martin'],
                'stub_name': '1991_JConnor_RecurrentNetworksAndNarmaModel'},
               {'year': 2017,
                'title_part': '"A Multi-Horizon Quantile Recurrent Forecaster"',
                'url': 'https://arxiv.org/pdf/1711.11053.pdf',
                'is_pdf': 'True',
                'authors': ['(Amazon) Ruofeng Wen',
                 'Kari Torkkola',
                 'Balakrishnan Narayanaswamy',
                 'Dhruv Madeka'],
                'stub_name': '2017_AmazonRuofengWen_AMultiHorizonQuantileRecurrent'}]})}

In [7]:
# Render one known-good record as a stub page to eyeball the generated text.
test_item = entry['subheadings']['RNN forecasting'][0]
test_stub = Stub(test_item)
# print() applies str() to its argument itself.
print(test_stub)

---
jupytext:
  formats: md:myst
  text_representation:
    extension: .md
    format_name: myst
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# "Recurrent Networks and NARMA Modeling"

```{code-cell} ipython3
:tags: [hide-input]

import panel as pn
pn.extension()
pdf_pane = pn.pane.PDF('https://proceedings.neurips.cc/paper/1991/file/5ef0b4eba35ab2d6180b0bca7e46b6f9-Paper.pdf', width=700, height=1000)
pdf_pane
```


In [8]:
# NOTE(review): the recorded output is an absolute local path that includes a user name;
# clear this output before sharing the notebook.
pwd

'C:\\Users\\shagg\\Documents\\projects\\anthology-of-modern-ml'

In [10]:
# NOTE(review): presumably writes the stub under this directory, relative to the current
# working directory -- confirm against Stub.write.
test_stub.write('anthology_of_modern_ml/stubs')

In [19]:
#parser.entries[:-1]
# NOTE(review): only the last expression of a cell is displayed, so the len() result is not shown.
len(parser.entries) # 14
parser.entries[1] # looks like the first entry parsed correctly and everything else... didn't (records are still raw lines).

{'heading': 'Network Graphs / combinatorial optimization',
 'subheadings': defaultdict(list,
             {'modularity / louvain community detection': ['* 2004 - ["Finding community structure in very large networks"](https://arxiv.org/abs/cond-mat/0408187) - Aaron Clauset, M. E. J. Newman, Cristopher Moore',
               '* 2008 - ["Fast unfolding of communities in large networks"](https://arxiv.org/abs/0803.0476) - Vincent D. Blondel, Jean-Loup Guillaume, Renaud Lambiotte, Etienne Lefebvre'],
              'pagerank': ['* 1998 - ["The PageRank Citation Ranking: Bringing Order to the Web"](http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf) - Larry Page'],
              'label propagation': ['* 2002 - ["Learning From Labeled and Unlabeled Data With Label Propagation"](http://mlg.eng.cam.ac.uk/zoubin/papers/CMU-CALD-02-107.pdf) - Xiaojin Zhu, Zoubin Ghahramani']})}

In [42]:
# We're not quite there yet, but the last step will look something like this:
for entry in entries:
    if 'subheadings' not in entry: # yeesh...
        continue
    for subheading, recs in entry['subheadings'].items():
        for rec in recs:
            if not 'url' in rec:
                continue
            #path = f"{entry['heading']} / {subheading}"
            #print((path, rec))
            meta_rec = {
                'heading':entry['heading'],
                'subheading':subheading,
                **rec
            }
            rich.print(meta_rec)

In [36]:
# Pretty-print every entry.
# NOTE(review): `entries` is not defined in any cell visible here; it must come from earlier kernel state.
for entry in entries:
    rich.print(entry)

In [31]:
# NOTE(review): `errors` is not defined in any cell visible here; it must come from earlier kernel state.
errors

['* Uh... here there be dragons. Maybe just leave some breadcrumbs here?',
 '* Probably discussed sufficiently in the Adam paper',
 '* see backprop',
 '* See also AlexNet',
 '* see also Alex Graves 2013',
 '* see also knowledge distillation below',
 '* http://karpathy.github.io/2019/04/25/recipe/']