# Publications markdown generator for academicpages

Takes a set of BibTeX publication entries and converts them for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)).

The core python code is also in `pubsFromBibs.py`. 
Run either from the `markdown_generator` folder after updating the `publist` dictionary with:
* bib file names
* specific venue keys based on your bib file preferences
* any specific pre-text for specific files
* Collection Name (future feature)

TODO: Make this work with other databases of citations.
TODO: Merge this with the existing TSV parsing solution

In [1]:
from pybtex.database.input import bibtex
import pybtex.database.input.bibtex 
from time import strptime
import string
import html
import os
import re
import warnings

In [2]:
# Name of the author to emphasise in generated citations. It is parsed into a
# pybtex Person and compared with each bibtex author, so it has to be written
# in a form that parses to the same name parts as the bib file's entry.
my_name = "Francesco Luzi"

In [3]:
# TODO: incorporate different collection types rather than a catch-all
# "publications" collection (requires other changes to the template).
#
# Template with one bib file per kind of publication:
#
# publist = {
#     "proceeding": {
#         "file": "proceedings.bib",
#         "venuekey": "booktitle",
#         "venue-pretext": "In the proceedings of ",
#         "collection": {"name": "publications", "permalink": "/publication/"},
#     },
#     "journal": {
#         "file": "pubs.bib",
#         "venuekey": "journal",
#         "venue-pretext": "",
#         "collection": {"name": "publications", "permalink": "/publication/"},
#     },
# }

# One entry per bib file to convert. The generator loop reads "file" and the
# "collection" name/permalink; it picks the venue field and pre-text from each
# entry's bibtex type, so "venuekey" and "venue-pretext" are not read by that
# loop as written in this notebook.
publist = {
    "proceeding": {
        "file": "FrancescoPublications.bib",
        "venuekey": "booktitle",
        "venue-pretext": "In the proceedings of ",
        "collection": {
            "name": "publications",
            "permalink": "/publication/",
        },
    },
}

In [4]:
# Per-character replacements applied by html_escape().
# A double quote is removed outright rather than turned into "&quot;": the
# table used to list '"' twice and, in a dict literal, the last duplicate key
# wins, so the empty string was always the replacement actually used.
html_escape_table = {
    "&": "&amp;",
    '"': "",
    "'": "&apos;",
}


def html_escape(text):
    """Return *text* with every character listed in html_escape_table replaced.

    Characters that have no entry in the table are copied through unchanged.
    """
    return "".join(html_escape_table.get(char, char) for char in text)

In [5]:
# Bibtex entry type -> (field that names the venue, text placed before it).
_VENUE_BY_ENTRY_TYPE = {
    "inproceedings": ("booktitle", "In the proceedings of "),
    "article": ("journal", ""),
    "techreport": ("institution", "Technical report at "),
}


def _strip_bibtex_markup(text):
    """Drop the braces and backslashes bibtex uses to protect formatting."""
    return text.replace("{", "").replace("}", "").replace("\\", "")


def _two_digit_month(raw_month, default="01"):
    """Return the month as a zero-padded number string.

    Accepts a month number ("3", "03") or a month name / abbreviation
    ("Mar", "March"). Names are parsed with strptime's %b directive, which
    follows the current locale. Anything unreadable yields *default* and a
    warning instead of aborting the whole run.
    """
    text = str(raw_month).strip()
    try:
        if text.isdigit():
            month_number = int(text)
        else:
            month_number = strptime(text[:3], '%b').tm_mon
    except ValueError:
        month_number = 0
    if not 1 <= month_number <= 12:
        warnings.warn('Could not interpret month ' + repr(raw_month) + ', using ' + default)
        return default
    return "{:02d}".format(month_number)


def _two_digit_day(raw_day, default="01"):
    """Return the day as a zero-padded number string.

    Only the leading digits are used, so a range such as "5--7" becomes "05".
    Anything without a usable leading number yields *default* and a warning.
    """
    leading_digits = re.match(r"\s*(\d+)", str(raw_day))
    if leading_digits and 1 <= int(leading_digits.group(1)) <= 31:
        return "{:02d}".format(int(leading_digits.group(1)))
    warnings.warn('Could not interpret day ' + repr(raw_day) + ', using ' + default)
    return default


def _format_author(person):
    """Render a pybtex Person as "First M. [von] Last", escaped for the front matter.

    Authors without a first name (e.g. institutions) are rendered from their
    remaining name parts instead of raising IndexError, and every token of a
    multi-word last name is kept.
    """
    name_parts = []
    if person.first_names:
        name_parts.append(person.first_names[0])
    if person.middle_names:
        # Only the initial of the first middle name is shown.
        name_parts.append(person.middle_names[0][0] + '.')
    name_parts.extend(person.prelast_names)
    name_parts.extend(person.last_names)
    # The citation ends up inside a single-quoted YAML string, so apostrophes
    # in names must be escaped.
    return html_escape(" ".join(name_parts))


# Parsed once; every author of every entry is compared against it.
_highlighted_author = pybtex.database.Person(my_name)

for pubsource in publist:
    parser = bibtex.Parser()
    bibdata = parser.parse_file(publist[pubsource]["file"])

    # Loop through the individual references in a given bibtex file.
    for bib_id in bibdata.entries:
        entry = bibdata.entries[bib_id]
        b = entry.fields

        # Date parts used when the entry does not provide month and/or day.
        pub_month = "01"
        pub_day = "01"

        # Any required field that is missing raises KeyError somewhere in this
        # body; the entry is then reported and skipped. Nothing is written to
        # disk until every field has been read.
        try:
            pub_year = f'{b["year"]}'

            # Venue field and pre-text depend on the bibtex entry type.
            if entry.type not in _VENUE_BY_ENTRY_TYPE:
                raise Exception('Unknown entry type')
            venuekey, venue_pretext = _VENUE_BY_ENTRY_TYPE[entry.type]

            if "month" in b:
                pub_month = _two_digit_month(b["month"])
            if "day" in b:
                pub_day = _two_digit_day(b["day"])
            pub_date = pub_year + "-" + pub_month + "-" + pub_day

            # Strip out {} and \ (some bibtex entries keep formatting markup).
            plain_title = _strip_bibtex_markup(b["title"])
            clean_title = plain_title.replace(" ", "-")

            # File name / permalink slug: drop [bracketed] text and anything
            # that is not alphanumeric, "_" or "-".
            url_slug = re.sub(r"\[.*\]|[^a-zA-Z0-9_-]", "", clean_title)
            url_slug = url_slug.replace("--", "-")
            html_filename = (str(pub_date) + "-" + url_slug).replace("--", "-")
            md_filename = html_filename + ".md"

            # Link target for the title: the entry's own URL when it has a
            # plausible one, otherwise a Google Scholar search for the title.
            url = "url" in b and len(str(b["url"])) > 5
            if url:
                url_to_paper = b["url"]
            else:
                url_to_paper = "https://scholar.google.com/scholar?q=" + html.escape(clean_title.replace("-", "+"))

            # Citation authors, with the primary author in bold.
            no_name = True
            author_names = []
            for author in entry.persons["author"]:
                author_name = _format_author(author)
                if author == _highlighted_author:
                    no_name = False
                    author_name = "<strong>" + author_name + "</strong>"
                author_names.append(author_name)

            if no_name:
                warnings.warn(my_name + ' was NOT found...')

            # "A", "A, and B", "A, B, and C": ", and" is only used when there
            # is more than one author.
            if len(author_names) > 1:
                author_text = ", ".join(author_names[:-1]) + ", and " + author_names[-1]
            else:
                author_text = "".join(author_names)
            # The leading space is kept so multi-author citations are
            # byte-identical to previously generated files.
            citation = " " + author_text + ". "

            # Citation title, linked to the paper.
            citation += "<a href=\"" + url_to_paper + "\">" + html_escape(plain_title) + "</a>."

            # Venue text depends on the citation type selected above.
            venue = venue_pretext + _strip_bibtex_markup(b[venuekey])
            citation = citation + " " + html_escape(venue)
            citation = citation + ", " + pub_year + "."

            note = "note" in b and len(str(b["note"])) > 5

            ## YAML variables
            front_matter = [
                "---",
                "title: \"" + html_escape(plain_title) + "\"",
                "collection: " + publist[pubsource]["collection"]["name"],
                "permalink: " + publist[pubsource]["collection"]["permalink"] + html_filename,
            ]
            if note:
                front_matter.append("excerpt: '" + html_escape(b["note"]) + "'")
            front_matter.append("date: " + str(pub_date))
            front_matter.append("venue: '" + html_escape(venue) + "'")
            if url:
                # Must sit inside the front matter (before the closing "---")
                # to be read as page metadata rather than printed as body text.
                front_matter.append("paperurl: '" + b["url"] + "'")
            # The citation already contains HTML markup, so it is not passed
            # through html_escape as a whole; its parts were escaped above.
            front_matter.append("citation: '" + citation + "'")
            front_matter.append("---")
            md = "\n".join(front_matter)

            ## Markdown description for individual page
            if note:
                md += "\n" + html_escape(b["note"]) + "\n"

            # Explicit encoding: titles may contain non-ASCII characters.
            with open("../_publications/" + md_filename, 'w', encoding="utf-8") as f:
                f.write(md)
            print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
        # A field may not exist for a reference.
        except KeyError as e:
            # The title itself may be the missing field, so do not index it.
            shown_title = b["title"] if "title" in b else ""
            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', shown_title[:30],"..."*(len(shown_title)>30),"\"")


SUCESSFULLY PARSED stevens2018sub: " A sub-sampled approach to extremely low-dose {STEM}  "
SUCESSFULLY PARSED mehdi2019controlling: " Controlling the spatio-temporal dose distribution during {ST ... "
SUCESSFULLY PARSED luzi2016acoustic: " Acoustic firearm discharge detection and classification in a ... "
SUCESSFULLY PARSED dar2020subspace: " Subspace fitting meets regression: {T}he effects of supervis ... "
SUCESSFULLY PARSED good2015emat: " {EMAT} for Rapid Screening of Waste Storage Tanks  "
SUCESSFULLY PARSED larche2015inspection: " Inspection of Hanford’s Double-Shell Waste Tanks Using Elect ... "
SUCESSFULLY PARSED glass2016inspection: " Inspection Technology Advancements for Hanford Double Shell  ... "
SUCESSFULLY PARSED luzi2017resolution: " Resolution versus error for computational electron microscop ... "
SUCESSFULLY PARSED luzi2021evaluating: " Evaluating generative networks using {G}aussian mixtures of  ... "
SUCESSFULLY PARSED luzi2021double: " Double descent and other in