In [185]:
from datetime import datetime, timedelta, timezone
import os
import io

import dr_util.file_utils as fu

In [192]:
import hydra
from omegaconf import DictConfig, OmegaConf
from hydra import initialize, compose

In [195]:
# Build the config with Hydra's compose API (there is no @hydra.main entry
# point in this notebook); the settings come from "paper_data.yaml".
with initialize(config_path="../configs/", version_base=None):
    cfg = compose(config_name="paper_data.yaml")

In [197]:
# Convert the OmegaConf object into plain Python containers, resolving any
# interpolations (resolve=True) so the values below are final.
cfg_resolved = OmegaConf.to_container(cfg, resolve=True)

In [200]:
# Dump the resolved configuration as YAML for a quick visual check.
print(f"Configuration: \n\n{OmegaConf.to_yaml(cfg_resolved)}")

Configuration: 

data_dir: /Users/daniellerothermel/drotherm/data/
raw_pdf_dir: /Users/daniellerothermel/drotherm/data/raw_pdfs/
parsed_pdf_dir: /Users/daniellerothermel/drotherm/data/parsed_pdfs/
metadata_dir: /Users/daniellerothermel/drotherm/data/parsed_pdfs/
author_data_dir: /Users/daniellerothermel/drotherm/data/author_data/
author_info: /Users/daniellerothermel/drotherm/data/author_data/manual_profiles.json



In [189]:
# Load the author profiles from the path declared in the config (`author_info`)
# instead of building it from AUTHOR_DATA_DIR, which is not defined in the
# visible cells and would raise NameError on a fresh kernel.
AUTHOR_INFO = fu.load_file(cfg_resolved["author_info"])

In [203]:
# List every author that has a profile entry.
print("Authors:")
for k in AUTHOR_INFO:
    print(f" - {k}")

Authors:
 - He He
 - Eunsol Choi
 - Mengye Ren
 - Rajesh Ranganath
 - Tal Linzen
 - Kyunghyun Cho
 - Lerrel Pinto
 - Pavel Izmailov


In [191]:
# Spot-check a single profile entry.
print(AUTHOR_INFO["He He"])


Assistant Professor of Computer Science and Data Science

Bio: He He is an Assistant Professor in Computer Science and Data Science. She is broadly interested in natural language process and machine learning. Her recent research focuses on understanding large language models, improving their trustworthiness, and human-AI interaction. Prior to joining NYU, she obtained her PhD from University of Maryland, did a post-doc at Stanford, and spent one year at AWS working in dialogue platforms.

Research Areas:

- Machine learning
- Deep learning
- Natural language processing

I want to build intelligent systems that can communicate with humans effectively and enable individuals to achieve their goals. Today’s systems are often opaque, brittle, and difficult to control, which limits their usefulness in human-centered applications. To make them our trustworthy collaborators, my research aims to (i) understand the computational foundation of generalization in novel scenarios, and (ii) build in

In [None]:
## Utils

In [5]:
def get_author_metadata_path(author):
    """Return the path of the query-metadata JSON file for ``author``.

    The file name is the author name with spaces replaced by underscores,
    followed by ``_query_metadata.json``, prefixed with ``METADATA_DIR``.
    Asserts that ``author`` is one of ``AUTHORS``.
    """
    assert author in AUTHORS
    file_stem = "_".join(author.split(" "))
    return f'{METADATA_DIR}{file_stem}_query_metadata.json'

def get_author_metadata(author):
    """Load and return the query metadata stored for ``author``."""
    return fu.load_file(get_author_metadata_path(author))

In [7]:
def get_parsed_pdf_path(pdf_name):
    """Return the ``.pkl`` path for ``pdf_name`` under ``PARSED_PDF_DIR``."""
    file_name = f'{pdf_name}.pkl'
    return f'{PARSED_PDF_DIR}{file_name}'

def get_parsed_pdf(pdf_name):
    """Load the parsed form of ``pdf_name``.

    Returns ``None`` when no file exists at the expected path; otherwise
    returns whatever ``fu.load_file`` produces for that file.
    """
    candidate = get_parsed_pdf_path(pdf_name)
    if not os.path.exists(candidate):
        return None
    return fu.load_file(candidate)

In [6]:
def get_author_parsed_papers(author):
    """Collect the papers of ``author`` for which a parsed PDF is available.

    Returns a list with one dict per paper: a shallow copy of the paper's
    entry in the author's ``pdfs_metadata`` plus a ``parsed_pdf`` key holding
    the loaded parse. Papers without a parsed file are left out.
    """
    metadata = get_author_metadata(author)
    papers = []
    for name, info in metadata['pdfs_metadata'].items():
        parsed = get_parsed_pdf(name)
        if parsed is not None:
            papers.append({**info, 'parsed_pdf': parsed})
    return papers

In [None]:
## Load Parsed, Extract Structure

In [15]:
# Smoke-test the loaders on the first author in AUTHORS.
# NOTE(review): the variable name assumes AUTHORS[0] is Pavel Izmailov, which
# matches the recorded output below — confirm against where AUTHORS is defined.
parsed_pdfs_pavel = get_author_parsed_papers(AUTHORS[0])
print(f">> Number of parsed papers for {AUTHORS[0]}: {len(parsed_pdfs_pavel)}")

>> Number of parsed papers for Pavel Izmailov: 17


In [18]:
# Use the first parsed paper as the running example for the cells below.
test_ppdf = parsed_pdfs_pavel[0]

In [23]:
# Show the example paper's metadata fields and how many blocks its parse holds.
print(test_ppdf['title'])
print(test_ppdf['published'])
print(test_ppdf['authors'])
print(test_ppdf['pdf_link'])
print(f">> Num blocks in parsed pdf: {len(test_ppdf['parsed_pdf'])}")

Can a Confident Prior Replace a Cold Posterior?
2024-03-02T17:28:55Z
['Martin Marek', 'Brooks Paige', 'Pavel Izmailov']
http://arxiv.org/pdf/2403.01272v1
>> Num blocks in parsed pdf: 19


### Utils

In [157]:
def reconstruct_split_text(split_text, verbose=False):
    """Flatten a list of sections back into a single string.

    Each section contributes its ``lines`` joined by blank lines, followed by
    one more blank-line separator. With ``verbose=True`` a banner naming the
    section's ``heading`` is emitted before its lines.
    """
    pieces = []
    for sec in split_text:
        if verbose:
            pieces.append(f"\n\n ===== Heading: {sec['heading']} \n\n")
        pieces.append("\n\n".join(sec['lines']))
        pieces.append("\n\n")
    return "".join(pieces)

In [107]:
def split_by_heading(text, title):
    """Split one block of text into sections delimited by ``#`` heading lines.

    Args:
        text: Newline-separated block text.
        title: Paper title; any heading line containing ``"# {title}"`` is
            dropped rather than starting a section.

    Returns:
        A list of ``{"heading": str, "lines": list[str]}`` dicts in document
        order. A section's ``lines`` holds the stripped heading line itself
        followed by its body lines. Text that precedes the first heading is
        collected under the heading ``"From Previous Block"``.

    Blank lines and lines whose first character is a digit are skipped.
    """
    tls = text.split("\n")
    title_str = f"# {title}"
    sections = []

    start_tl_strip = tls[0].strip()
    # startswith() is safe on an empty first line, where indexing [0] would
    # raise IndexError (e.g. text that is empty or begins with a newline).
    if title_str in start_tl_strip or not start_tl_strip.startswith("#"):
        # NOTE(review): a non-heading first line is discarded here, exactly as
        # before; confirm whether that first line of body text should be kept.
        start_heading = "From Previous Block"
        start_lines = []
    else:
        # Strip every leading '#' so "## Foo" yields "Foo", not "# Foo".
        start_heading = start_tl_strip.lstrip("#").strip()
        start_lines = [start_tl_strip]

    curr_section = {"heading": start_heading, "lines": start_lines}
    for tl in tls[1:]:
        tl_strip = tl.strip()
        # Skip blanks and digit-led lines (presumably page numbers; this also
        # drops body lines that happen to start with a digit).
        if not tl_strip or tl_strip[0].isdigit():
            continue

        if tl_strip.startswith("#"):
            # Drop all header mentions of the title, we'll add it back in
            if title_str in tl_strip:
                continue
            # Otherwise start a new section
            sections.append(curr_section)
            curr_section = {"heading": tl_strip.lstrip("#").strip(), "lines": []}
        curr_section['lines'].append(tl_strip)

    sections.append(curr_section)
    return sections

In [137]:
def get_all_sects(input_ppdf, input_title):
    """Split every block of a parsed PDF into sections and concatenate them.

    Each block's ``.text`` is passed through ``split_by_heading``. The first
    section of the first block is treated as the title section and skipped.
    """
    collected = []
    for block_idx, block in enumerate(input_ppdf):
        block_sects = split_by_heading(block.text, input_title)
        # Only the very first block carries the title section to discard.
        first_kept = 1 if block_idx == 0 else 0
        collected += block_sects[first_kept:]
    return collected

In [135]:
def group_sections(sections):
    """Merge raw sections into heading groups and set figure sections aside.

    Args:
        sections: Iterable of ``{"heading": str, "lines": list[str]}`` dicts.

    Returns:
        ``(grouped_sections, figs)``. ``grouped_sections`` is a list of new
        ``{"heading", "lines"}`` dicts; ``figs`` holds the original section
        dicts whose heading starts with ``"Figure"``.

    Rules applied, in order, to each section that has at least one line:
      * Figure sections go to ``figs`` and never open a group.
      * Right after a figure, a section whose first line starts with a
        lowercase letter is treated as a continuation of the previous group.
        Its heading is folded back into that first line, unless the heading
        is the ``"From Previous Block"`` placeholder, which is not real text.
      * A section opens a new group only when its heading is not the
        placeholder and differs from the current group's heading.
    """
    carry_over = "From Previous Block"
    grouped_sections = []
    figs = []
    last_was_fig = False

    for section in sections:
        lines = section['lines']
        if len(lines) == 0:
            continue

        heading = section['heading']

        # Handle figures before seeding the first group, so a leading figure
        # cannot create an empty group carrying a "Figure ..." heading.
        if heading.startswith("Figure"):
            figs.append(section)
            last_was_fig = True
            continue

        if len(grouped_sections) == 0:
            grouped_sections.append({'heading': heading, 'lines': []})

        if last_was_fig:
            last_was_fig = False
            # Slicing keeps an empty first line from raising; an empty line is
            # simply not a lowercase continuation.
            if lines[0][:1].islower():
                if heading == carry_over:
                    first_l = lines[0]
                else:
                    first_l = f"{heading} {lines[0]}"
                grouped_sections[-1]['lines'].append(first_l)
                grouped_sections[-1]['lines'].extend(lines[1:])
                continue

        if heading != carry_over and grouped_sections[-1]['heading'] != heading:
            grouped_sections.append({'heading': heading, 'lines': []})
        grouped_sections[-1]['lines'].extend(lines)
    return grouped_sections, figs

In [146]:
def ppdf_to_body_refs_figs(input_ppdf):
    """Split one parsed paper into body sections, figure sections and references.

    Reads ``input_ppdf['parsed_pdf']`` and ``input_ppdf['title']``, prints the
    section counts, and returns ``(body, figures, references)``. ``body`` is
    every grouped section before the first one whose heading contains
    ``'References'``; ``references`` is that section, or ``None`` if there is
    none. Anything after the references section is not returned.
    """
    sections = get_all_sects(input_ppdf['parsed_pdf'], input_ppdf['title'])
    print(f">> There are {len(sections)} sections total.")

    grouped, figures = group_sections(sections)
    print(f">> There are {len(grouped)} grouped sections and {len(figures)} figures.")

    ref_idx = next(
        (idx for idx, grp in enumerate(grouped) if 'References' in grp['heading']),
        None,
    )
    if ref_idx is None:
        return list(grouped), figures, None
    return grouped[:ref_idx], figures, grouped[ref_idx]

## Test Full Flow

In [147]:
# Run the full pipeline on the example paper: body sections, figure sections
# and the references section.
bd_s, fg_s, rfs = ppdf_to_body_refs_figs(test_ppdf)

>> There are 76 sections total.
>> There are 49 grouped sections and 8 figures.


In [161]:
# print(reconstruct_split_text(bd_s + fg_s))
# rfs

In [148]:
# Line count and heading of every body section (those before References).
for gt in bd_s:
    print(f"{len(gt['lines']):2} | {gt['heading']}")

 2 | Abstract
 2 | 1. Introduction
10 | In a regression setting
 2 | 2. Background
 7 | 2.1. Bayesian neural networks
 3 | 2.3. What is wrong with cold posteriors?
 1 | 3. Related work
 2 | Cold posteriors and data augmentation.
 3 | Label noise.
 2 | Prior misspecification.
 5 | Improved prior distributions.
 4 | 4. Confidence of a Normal prior
 6 | 5. Dirichlet prior
19 | log density
 1 | Train accuracy
22 | Test accuracy
 3 | 7. Clipped Dirichlet prior
 2 | 7.1. Training stability
12 | 1.0
22 | 8. Confidence priorpost. density
 6 | 9. Discussion
 2 | Summary
 2 | Impact statement
 2 | Acknowledgements


### Sub Section Tests

In [138]:
# Step 1 in isolation: split the example paper's blocks into raw sections.
all_sections_test = get_all_sects(test_ppdf['parsed_pdf'], test_ppdf['title'])
print(f">> There are {len(all_sections_test)} sections total")

>> There are 76 sections total


In [139]:
# Step 2 in isolation: group the raw sections and separate out the figures.
grouped_test, figs_test = group_sections(all_sections_test)
print(f">> There are {len(grouped_test)} grouped sections")

>> There are 49 grouped sections


In [145]:
# Line count and heading of every grouped section, references and appendix included.
for gt in grouped_test:
    print(f"{len(gt['lines']):3} | {gt['heading']}")

  2 | Abstract
  2 | 1. Introduction
 10 | In a regression setting
  2 | 2. Background
  7 | 2.1. Bayesian neural networks
  3 | 2.3. What is wrong with cold posteriors?
  1 | 3. Related work
  2 | Cold posteriors and data augmentation.
  3 | Label noise.
  2 | Prior misspecification.
  5 | Improved prior distributions.
  4 | 4. Confidence of a Normal prior
  6 | 5. Dirichlet prior
 19 | log density
  1 | Train accuracy
 22 | Test accuracy
  3 | 7. Clipped Dirichlet prior
  2 | 7.1. Training stability
 12 | 1.0
 22 | 8. Confidence priorpost. density
  6 | 9. Discussion
  2 | Summary
  2 | Impact statement
  2 | Acknowledgements
 38 | References
  9 | Appendix outline
  3 | A. Model comparison
  1 | B. Further discussion of DirClip prior
 23 | B.1. Clipping value is reached
  8 | B.2. Low likelihood
  2 | B.3. Fine-tuning has converged
  3 | D. Proof that confidence prior converges to a cold likelihood
 17 | Tempered categ. likelihood
  3 | E.2. Proof that Dirichlet diverges
 16 | F. Di

## Putting it All Together

In [180]:
def make_author_page(author):
    """Build the markdown summary page for one author.

    The page contains a title, the author's bio from ``AUTHOR_INFO``, and then
    every parsed paper returned by ``get_author_parsed_papers``: its title,
    publication date, author list, body sections and figure sections,
    followed by an end-of-paper marker. The references section is extracted
    but not written to the page.

    Returns the page as a single string. Raises ``KeyError`` if ``author`` has
    no entry in ``AUTHOR_INFO``.
    """
    bio = AUTHOR_INFO[author]

    buff = io.StringIO()
    buff.write(f"# Research Summary for {author}\n\n")
    buff.write(f"## Bio\n{bio}\n\n")

    buff.write("## Recent Papers\n\n")
    for ppdf in get_author_parsed_papers(author):
        buff.write(f"# Title: {ppdf['title']}\n Published: {ppdf['published']}\n")
        buff.write("Authors: " + ", ".join(ppdf['authors']) + "\n\n")

        bd_s, fg_s, _rfs = ppdf_to_body_refs_figs(ppdf)
        buff.write(reconstruct_split_text(bd_s + fg_s))
        # Terminate the marker with a blank line; without it the next paper's
        # "# Title:" header is glued onto the end of this marker line.
        buff.write(f"\n\n -------------- End Paper: {ppdf['title']}\n\n")
    return buff.getvalue()

In [181]:
# Build the page for the first author in AUTHORS. Despite the variable name,
# make_author_page writes every parsed paper for the author, not just one.
bio_and_one_paper = make_author_page(AUTHORS[0])

>> There are 76 sections total.
>> There are 49 grouped sections and 8 figures.
>> There are 144 sections total.
>> There are 115 grouped sections and 19 figures.
>> There are 95 sections total.
>> There are 67 grouped sections and 7 figures.
>> There are 120 sections total.
>> There are 96 grouped sections and 21 figures.
>> There are 103 sections total.
>> There are 92 grouped sections and 4 figures.
>> There are 145 sections total.
>> There are 134 grouped sections and 6 figures.
>> There are 90 sections total.
>> There are 88 grouped sections and 1 figures.
>> There are 25 sections total.
>> There are 23 grouped sections and 1 figures.
>> There are 184 sections total.
>> There are 140 grouped sections and 11 figures.
>> There are 155 sections total.
>> There are 141 grouped sections and 10 figures.
>> There are 114 sections total.
>> There are 104 grouped sections and 6 figures.
>> There are 103 sections total.
>> There are 72 grouped sections and 5 figures.
>> There are 47 section

In [183]:
# Write the page under the configured data directory instead of a hard-coded
# absolute path (data_dir ends with a path separator in the config shown above).
fu.dump_file(bio_and_one_paper, f"{cfg_resolved['data_dir']}pavel_izmailov_summary_markdown.txt")

True

In [184]:
# Generate and save one summary page per author under the configured data
# directory instead of a hard-coded absolute path.
for author in AUTHORS:
    bio_and_one_paper = make_author_page(author)
    author_slug = author.replace(" ", "_").lower()
    fu.dump_file(bio_and_one_paper, f"{cfg_resolved['data_dir']}{author_slug}_summary_markdown.txt", verbose=True)
    

>> There are 76 sections total.
>> There are 49 grouped sections and 8 figures.
>> There are 144 sections total.
>> There are 115 grouped sections and 19 figures.
>> There are 95 sections total.
>> There are 67 grouped sections and 7 figures.
>> There are 120 sections total.
>> There are 96 grouped sections and 21 figures.
>> There are 103 sections total.
>> There are 92 grouped sections and 4 figures.
>> There are 145 sections total.
>> There are 134 grouped sections and 6 figures.
>> There are 90 sections total.
>> There are 88 grouped sections and 1 figures.
>> There are 25 sections total.
>> There are 23 grouped sections and 1 figures.
>> There are 184 sections total.
>> There are 140 grouped sections and 11 figures.
>> There are 155 sections total.
>> There are 141 grouped sections and 10 figures.
>> There are 114 sections total.
>> There are 104 grouped sections and 6 figures.
>> There are 103 sections total.
>> There are 72 grouped sections and 5 figures.
>> There are 47 section