# Publications markdown generator

Takes a list of publications in .bib format and converts them for use with the Jupyter notebook `publication.ipynb`. Run it from the `markdown_generator` folder after replacing `publications.bib` with one containing your data. Journal entries will be saved in the folder `../_publications`.

Note: the `pybtex` package must be installed to parse the .bib file:
```
pip install pybtex
```

## View the .bib file
Read the .bib file and create a pandas `DataFrame` to view the entries.

In [5]:
import os
import calendar
from datetime import datetime
from pybtex.database import parse_file
import pandas as pd

# Parse the BibTeX file once; the functions and cells below read from
# the module-level `bib_data`.
bib_data = parse_file('publications.bib')
#print(bib_data)

# View the data in a pandas DataFrame.
# Collect one record per entry and build the frame in a single call:
# growing a DataFrame with pd.concat inside the loop copies every
# previously added row again on each iteration.
records = []
for entry in bib_data.entries:
    fields = bib_data.entries[entry].fields
    title = fields['title']
    # Titles are expected to carry one extra pair of protective braces;
    # strip it only when it is really there so that an unbraced title
    # does not lose its first and last characters.
    if title.startswith('{') and title.endswith('}'):
        title = title[1:-1]
    records.append({
        'entry': entry,
        'year': fields['year'],
        'journal': fields['journal'],
        'title': title,
    })
df = pd.DataFrame(records, columns=['entry', 'year', 'journal', 'title'])

# Newest publications first; the bare expression on the last line makes the
# notebook display the table.
df_sort = df.sort_values(by=['year'], ascending=False)
#df_sort.to_csv('pub.csv', index=False)
df_sort

Unnamed: 0,entry,year,journal,title
0,Ding2025,2025,Science Advances,Demystifying the drivers of the spring warming...
3,Chen2025,2025,Journal of Climate,Effective Time Scale of the Northern Hemispher...
1,Ge2025b,2025,Journal of Advances in Modeling Earth Systems,Moist Energy Constraints on Surface Temperatur...
2,Ge2025a,2025,Environmental Pollution,The short-term comprehensive impact of the pha...
5,Ding2024,2024,Environmental Research: Climate,North American cooling signature of strong str...
...,...,...,...,...
62,Chen2008b,2008,Journal of the Atmospheric Sciences,The Tropospheric Jet Response to Prescribed Zo...
61,Chen2008,2008,Journal of Climate,Phase Speed Spectra and the Latitude of Surfac...
63,Chen2007a,2007,Journal of the Atmospheric Sciences,Sensitivity of the Latitude of the Surface Wes...
95,Chen2007b,2007,Geophysical Research Letters,Phase speed spectra and the recent poleward sh...


## Functions
Defining the functions to get the information about filenames, authors, and journals

### Escape special characters

YAML is very picky about what it accepts as a valid string, so we replace single and double quotes (and ampersands) with their HTML-encoded equivalents. This makes them less readable in raw form, but they are parsed and rendered nicely.

In [6]:
# Characters that are swapped for HTML entities before text is embedded in
# the generated front matter.
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
}


def html_escape(text):
    """Return `text` with every character found in `html_escape_table`
    replaced by its HTML entity; all other characters pass through as-is."""
    pieces = []
    for ch in text:
        if ch in html_escape_table:
            pieces.append(html_escape_table[ch])
        else:
            pieces.append(ch)
    return "".join(pieces)

### get_filenames()

In [7]:
# English month names, in calendar order. BibTeX month values are English, so
# the lookup is done against this table rather than the locale-dependent
# `calendar.month_abbr`.
_MONTH_NAMES = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)


def _month_number(raw):
    """Convert a BibTeX month value (name, abbreviation or number) to 1-12."""
    text = str(raw).strip().strip("{}").strip().rstrip(".").lower()
    if text.isdigit():
        number = int(text)
        if 1 <= number <= 12:
            return number
    elif len(text) >= 3:
        # Accept any prefix of at least three letters: "jun", "sept", "June".
        for number, name in enumerate(_MONTH_NAMES, start=1):
            if name.startswith(text):
                return number
    raise ValueError("unrecognised month value: " + repr(raw))


def get_filenames(entry):
    """Build the output file names for one bibliography entry.

    Parameters
    ----------
    entry : str
        Key of the entry in the module-level `bib_data`.

    Returns
    -------
    tuple
        ``(year, month, md_filename, html_filename)`` where `year` is the
        entry's `year` field as stored, `month` is an int, `md_filename` is
        ``"<year>-<month>-1-<entry>.md"`` and `html_filename` is the same
        name without the ``.md`` suffix.

    Raises
    ------
    ValueError
        If the entry has a `month` field that cannot be interpreted.
    """
    fields = bib_data.entries[entry].fields
    year = fields['year']
    if 'month' in fields:
        month = _month_number(fields['month'])
    else:
        # NOTE: falling back to today's month makes the file name depend on
        # when the notebook is run.
        print('no month information is found for ' + entry)
        month = datetime.now().month                # use current month if no month information is found
        print('use the current month: ' + str(month))

    # The date part is deliberately not zero-padded (e.g. "2007-7-1").
    pub_date = str(year) + "-" + str(month) + "-" + "1"

    md_filename = pub_date + "-" + entry + ".md"
    html_filename = pub_date + "-" + entry

    return year, month, md_filename, html_filename

#get_filenames('Guan2021')

### get_author()

In [8]:
def get_author(entry):
    """Format the author list of one bibliography entry.

    The first author is written as "Last, First Middle"; every later author
    as "First Middle Last". Authors are separated by ", " except that the
    final one is introduced by " and ". Curly braces are removed from the
    result.

    Parameters
    ----------
    entry : str
        Key of the entry in the module-level `bib_data`.

    Returns
    -------
    str
        The formatted author string (empty if the author list is empty).

    Raises
    ------
    KeyError
        If the entry has no `author` persons.
    """
    names = []
    for position, person in enumerate(bib_data.entries[entry].persons['author']):
        given = " ".join(person.bibtex_first_names)
        # Keep the whole surname: particles such as "von" / "de la" and every
        # part of a multi-word last name, not just its first token.
        family = " ".join([*person.prelast_names, *person.last_names])
        if position == 0:
            names.append(family + ", " + given)
        else:
            names.append(given + " " + family)

    if len(names) > 1:
        author_list = ", ".join(names[:-1]) + " and " + names[-1]
    else:
        author_list = "".join(names)

    # Drop BibTeX protective braces.
    return author_list.replace("{", "").replace("}", "")

#get_author('Guan2021')

### get_journal()

In [9]:
def get_journal(entry):
    """Collect the journal-related pieces of one bibliography entry.

    Parameters
    ----------
    entry : str
        Key of the entry in the module-level `bib_data`.

    Returns
    -------
    tuple
        ``(journal, cit_journal, title, paper_url, excerpt)``:
        the raw `journal` field; the italicised journal followed by the
        volume, pages and DOI when present and a closing period; the
        HTML-escaped title; the `url` field or "" when absent; and the
        `abstract` field or "" when absent.

    Raises
    ------
    KeyError
        If the entry has no `journal` or `title` field.
    """
    fields = bib_data.entries[entry].fields

    journal = fields['journal']
    cit_journal = "<i>" + journal + "</i>"
    for key, prefix in (('volume', ", "), ('pages', ", "), ('doi', ", doi:")):
        if key in fields:
            cit_journal += prefix + fields[key]
    cit_journal += "."

    # Titles are expected to carry one extra pair of protective braces; strip
    # it only when present so an unbraced title keeps all its characters.
    raw_title = fields['title']
    if raw_title.startswith("{") and raw_title.endswith("}"):
        raw_title = raw_title[1:-1]
    title = html_escape(raw_title)

    # A missing URL yields "" (instead of a KeyError) so the caller's length
    # check can simply skip the download link.
    paper_url = fields['url'] if 'url' in fields else ""
    excerpt = fields['abstract'] if 'abstract' in fields else ""

    return journal, cit_journal, title, paper_url, excerpt

#get_journal('Guan2021')

## Generate `.md` entries from the `.bib` file
Two types of output
* if the `.md` entry does not exist, then create a new entry
* if the `.md` entry exists but is different from the one from `.bib`, then report the difference (the code that writes the regenerated entry to a `.new` file is currently commented out)

In [10]:
# Output folder for the generated pages, relative to `markdown_generator`.
PUBLICATIONS_DIR = "../_publications/"

for i_entry, entry in enumerate(bib_data.entries):
    # create the name of each md file
    year, month, md_filename, html_filename = get_filenames(entry)

    # set up author names
    author_list = get_author(entry)

    # set up the citation for the publishing journal
    journal, cit_journal, title, paper_url, excerpt = get_journal(entry)

    # `title` comes back from get_journal() already HTML-escaped; escaping it
    # a second time would turn "&quot;" into "&amp;quot;", so only the other
    # pieces are escaped here.
    citation = (html_escape(author_list + ", " + str(year) + ": ")
                + title + ", " + html_escape(cit_journal))

    ## YAML variables
    md = "---\ntitle: \"" + title + '"\n'
    md += "collection: publications"
    md += "\npermalink: /publication/" + html_filename
    md += "\nyear: " + str(year)
    md += "\nauthor: " + str(author_list)
    # Inside a single-quoted YAML scalar a literal apostrophe must be doubled,
    # otherwise a journal such as "Earth's Future" ends the string early.
    md += "\nvenue: '" + journal.replace("'", "''") + "'"
    md += "\nvenue_cit: '" + cit_journal.replace("'", "''") + "'"
#    md += "\npaperurl: '" + paper_url + "'"
    md += "\ncitation: '" + citation + "'"
    md += "\n---"

    ## Markdown description for individual page
    # Every occurrence of the word "Abstract" is rewritten as a heading line.
    excerpt = excerpt.replace("Abstract", "ABSTRACT:\n")
    if len(str(excerpt)) > 5:
        md += "\n" + html_escape(excerpt) + "\n"

    if len(str(paper_url)) > 5:
        md += "\nDownload paper: [journal website](" + paper_url + ")\n"
    md_filename = os.path.basename(md_filename)

    ## Output information
    md_path = PUBLICATIONS_DIR + md_filename
    if not os.path.exists(md_path):
        print("#" + str(i_entry) + ", " + entry + ": write new entry!")
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(md)
    else:
        # errors='replace': a file saved earlier in another encoding is then
        # reported as different instead of aborting the run.
        with open(md_path, 'r', encoding='utf-8', errors='replace') as f:
            md_exist = f.read()
        if md != md_exist:
            # Existing files are only reported, never overwritten. To stage a
            # replacement for the "override" cell, write `md` to
            # md_path + ".new" here.
            print("#" + str(i_entry) + ", " + entry + ": found differences")

    #print(md)

# len() rather than i_entry + 1: the loop variable does not exist when the
# bibliography is empty.
print('Total number of entries: ' + str(len(bib_data.entries)))
    

#0, Ding2025: write new entry!
#1, Ge2025b: write new entry!
#2, Ge2025a: write new entry!
#3, Chen2025: write new entry!
#8, Ma2024a: found differences
#9, Ma2023a: found differences
#12, Ding2023: found differences
#15, Zhang2023: found differences
#16, Jiang2022: found differences
#19, Ding2022: found differences
#25, Ma2021: found differences
Total number of entries: 97


## Override the entries with differences

In [None]:
# updating the entries
# Every "<name>.new" file in the publications folder replaces "<name>".
# os.replace is used instead of shelling out to `mv`: it works on every
# platform, is safe for unusual file names, and raises if the move fails.
pub_dir = "../_publications"
files = os.listdir(pub_dir)
for file in files:
    if file.endswith('.new'):
        target = file[:-len('.new')]
        print("replacing " + target)
        os.replace(os.path.join(pub_dir, file), os.path.join(pub_dir, target))

## Other entries
Copy other entries manually to the folders `../files` and `../_publications`.

In [None]:
# Local import: this cell is the only place that needs shutil.
import shutil

# Copy the hand-written thesis entry and its PDF once, i.e. only while the
# entry is not yet present in the publications folder. shutil.copy replaces
# the shell `cp` calls so the cell is portable and a missing source file
# raises instead of failing silently.
if not os.path.exists("../_publications/2007-7-1-Chen2007.md"):
    shutil.copy("thesis.pdf", "../files")
    shutil.copy("2007-7-1-Chen2007.md", "../_publications")