# Create author metadata & markdown section templates with PSB attendees

In [1]:
import re

import tika.parser
import yaml

## Read PDF to text and parse

`attendees.pdf` was downloaded from https://psb.stanford.edu/conference-materials/attendees.pdf.

In [2]:
# Extract the PDF's text with Apache Tika.
# https://stackoverflow.com/a/48673754
parsed = tika.parser.from_file('attendees.pdf')
text = parsed["content"]
# Fail here with a clear message rather than later with an AttributeError on
# None when Tika could not extract any text from the PDF.
if not text:
    raise ValueError("Tika extracted no text content from 'attendees.pdf'")

In [3]:
# Strip surrounding whitespace from every line so that whitespace-only lines
# become truly empty, then rebuild the text.
text = '\n'.join(map(str.strip, text.splitlines()))
# A run of one or more empty lines separates consecutive stanzas.
stanzas = re.split(r"\n\n+", text)

In [4]:
# Stanzas that are not attendee entries (empty strings and the
# title/header/date text of the PDF) and must be dropped.
remove = set([
    '',
    'PSB participant list2',
    'Pacific Symposium on Biocomputing 2019\nParticipant List',
    'as of December 6, 2018',
])
# Keep every stanza that is not an exact match for one of the entries above.
stanzas = list(filter(lambda stanza: stanza not in remove, stanzas))

## Create `author-metadata.yaml` template

In [5]:
# Blank per-author record; the key order here is the order the fields are
# intended to appear in the generated YAML.
author_template = dict(
    name=None,
    orcid=None,
    github=None,
    twitter=None,
    affiliations=[],
)

In [6]:
# Build one metadata record per stanza: the first line of the stanza is taken
# as the name, and any remaining lines are collapsed into one affiliation.
author_metadata = list()
for stanza in stanzas:
    # partition (unlike a two-way split) tolerates a stanza that has no second
    # line: the affiliation text is then empty instead of raising ValueError.
    name, _newline, affiliation = stanza.partition('\n')
    # Collapse internal newlines and repeated whitespace into single spaces.
    affiliation = ' '.join(affiliation.split())
    author = author_template.copy()
    author['name'] = name
    # Assign a fresh list each time; the shallow copy above would otherwise
    # share the template's list object between records.
    author['affiliations'] = [affiliation] if affiliation else []
    author_metadata.append(author)

In [7]:
# Show the first two records as a quick sanity check of the parsing.
author_metadata[0:2]

[{'name': 'Max Alekseyev',
  'orcid': None,
  'github': None,
  'twitter': None,
  'affiliations': ['George Washington University']},
 {'name': 'Russ Altman',
  'orcid': None,
  'github': None,
  'twitter': None,
  'affiliations': ['Stanford University']}]

In [8]:
# Respect dictionary order https://stackoverflow.com/a/52621703/4651668
# Passing data.items() hands the representer the pairs in the dict's own
# iteration order.
yaml.add_representer(dict, lambda self, data: yaml.representer.SafeRepresenter.represent_dict(self, data.items()))

# Write UTF-8 explicitly and let PyYAML emit non-ASCII characters literally,
# so names with accents stay human-readable in the template and the output
# does not depend on the platform's default encoding.
with open('author-metadata.yaml', 'w', encoding='utf-8') as write_file:
    yaml.dump(author_metadata, write_file, default_flow_style=False, allow_unicode=True)

## Create markdown section template

In [9]:
# Write one HTML-comment placeholder per author, each followed by blank lines
# that leave room for that author's section.
# UTF-8 is set explicitly so names containing non-ASCII characters are written
# the same way regardless of the platform's default encoding.
with open('author-sections.md', 'w', encoding='utf-8') as write_file:
    for author in author_metadata:
        write_file.write(f'<!-- {author["name"]} -->\n\n\n\n')