# Clean Bibliography

The goal of this notebook is to clean your `.bib` file to ensure that it only contains references that you have cited in your paper. This cleaned `.bib` will then be used to generate a data table of names that will be used to query the probabilistic gender classifier, [Gender API](https://gender-api.com). 

The only required file you need is your manuscript's bibliography in `.bib` format. __Your `.bib` must only contain references cited in the manuscript__. Otherwise, the estimated gender proportions will be inaccurate. 

If you are not using LaTeX, collect and organize only the references you have cited in your manuscript using your reference manager of choice (e.g. Mendeley, Zotero, EndNote, ReadCube, etc.) and export that selected bibliography as a `.bib` file. For those working in LaTeX, we can use an optional `.aux` file to automatically filter your `.bib` to check that it only contains entries which are cited in your manuscript.

## Import libraries, set paths, check settings

### Upload your .bib file(s) and optionally an .aux file generated from compiling your LaTeX manuscript.

![upload button](img/upload.png)

![confirm upload button](img/confirmUpload.png)

In [None]:
import numpy as np
import bibtexparser
from bibtexparser.bparser import BibTexParser
import glob
import subprocess
import os
from pybtex.database.input import bibtex
import csv


def checkcites_output(aux_file):
    """Run ``checkcites.lua`` on a LaTeX .aux file and report its findings.

    Args:
        aux_file: Sequence of .aux paths; only the first element is passed
            to ``texlua checkcites.lua``.

    Returns:
        Normally a list built from every output line that starts with
        ``-``, each with its first two characters removed (the ``- ``
        prefix in front of a citation key).  If one of those trimmed lines
        is the long dashed rule checked below, the complete decoded output
        text is returned instead of the list.
    """
    completed = subprocess.run(
        ['texlua', 'checkcites.lua', aux_file[0]],
        stdout=subprocess.PIPE,
    )
    report_text = completed.stdout.decode('utf-8')

    # Keep only the lines flagged with a leading '-' and drop the two-character prefix.
    dashed_lines = [
        line[2:] for line in report_text.split('\n') if line.startswith('-')
    ]

    # A dashed rule among the trimmed lines means the output is not a plain
    # "- key" listing, so hand back the raw text for the caller to inspect.
    rule = "------------------------------------------------------------------------"
    if rule in dashed_lines:
        return report_text
    return dashed_lines
    
    
# Working directory holding the uploaded files.  The trailing slash matters:
# other cells build paths by concatenating file names directly onto it.
homedir = '/home/jovyan/'
# All BibTeX libraries and (optionally) the LaTeX .aux file found in that directory.
bib_files = glob.glob(os.path.join(homedir, '*.bib'))
paper_aux_file = glob.glob(os.path.join(homedir, '*.aux'))
# File name used for the bibliography restricted to cited references.
paper_bib_file = 'library_paper.bib'

### Define the _first_ and _last_ author of your paper.

For example: 
```
yourFirstAuthor = 'Teich, Erin G.'
yourLastAuthor = 'Bassett, Danielle S.'
```

In [None]:
yourFirstAuthor = 'LastName, FirstName OptionalMiddleInitial'
yourLastAuthor = 'LastName, FirstName OptionalMiddleInitial'
optionalEqualContributors = 'LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial'

# Template value shipped with the notebook; a name still equal to it has not been filled in.
_NAME_PLACEHOLDER = 'LastName, FirstName OptionalMiddleInitial'

if yourFirstAuthor == _NAME_PLACEHOLDER or yourLastAuthor == _NAME_PLACEHOLDER:
    raise ValueError("Please enter your manuscript's first and last author names")

# With an .aux file available: drop uncited references, write the cited-only
# bibliography, then drop self-citations and write a second bibliography.
if paper_aux_file:
    # Flatten the equal contributors into individual names.  Nesting the
    # tuple inside np.array (as before) produced a ragged array and made the
    # later "name in author string" test fail with a TypeError.  A single
    # name given as a plain string is accepted too.
    if isinstance(optionalEqualContributors, str):
        equal_contributors = [optionalEqualContributors]
    else:
        equal_contributors = list(optionalEqualContributors)
    equal_contributors = [name for name in equal_contributors if name != _NAME_PLACEHOLDER]
    citing_authors = np.array([yourFirstAuthor, yourLastAuthor] + equal_contributors)

    # Run checkcites once and reuse the result for both the printout and the filtering.
    unused_in_paper = checkcites_output(paper_aux_file)  # citations in library not used in paper
    print(unused_in_paper)
    if isinstance(unused_in_paper, str):
        # Raw report text: match keys as whole whitespace-delimited tokens.
        # A plain substring test would also drop e.g. "smith2020" whenever
        # "smith2020a" is reported as unused.
        unused_keys = set(unused_in_paper.split())
        unused_count = unused_in_paper.count('=>')
    else:
        # Already a list of citation keys.
        unused_keys = set(unused_in_paper)
        unused_count = len(unused_in_paper)
    print("Unused citations: ", unused_count)

    # Skip the files this cell itself writes so that a re-run does not merge
    # its own earlier output back in (which would be reported as duplicates).
    # If nothing else is left, fall back to the full list.
    generated_names = {
        os.path.basename(paper_bib_file),
        os.path.basename(os.path.splitext(paper_bib_file)[0] + '_noselfcite.bib'),
    }
    source_bib_files = [f for f in bib_files if os.path.basename(f) not in generated_names] or bib_files

    # Merge every library into one database, using a new parser object per file.
    bib_data = None
    for bib_file in source_bib_files:
        with open(bib_file) as bibtex_file:
            parsed_bib = BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)
        if bib_data is None:
            bib_data = parsed_bib
        else:
            bib_data.entries_dict.update(parsed_bib.entries_dict)
            bib_data.entries.extend(parsed_bib.entries)

    if bib_data is None:
        raise FileNotFoundError("No .bib file was found in " + homedir)

    all_library_citations = list(bib_data.entries_dict.keys())
    print("All citations: ", len(all_library_citations))

    # Keep the entries dictionary and the entries list in sync: remove uncited keys from both.
    for k in all_library_citations:
        if k in unused_keys:
            del bib_data.entries_dict[k]
    bib_data.entries = [entry for entry in bib_data.entries if entry['ID'] not in unused_keys]
    # Kept from the original: discard parsed comments before dumping.
    # NOTE(review): relies on the writer tolerating the missing attribute -- confirm.
    del bib_data.comments

    # A key is a duplicate when more than one remaining entry carries it.
    # Counting IDs directly avoids false positives from keys that merely
    # appear inside another key or inside a field value.
    id_counts = {}
    for entry in bib_data.entries:
        id_counts[entry['ID']] = id_counts.get(entry['ID'], 0) + 1
    duplicates = [key for key in bib_data.entries_dict if id_counts.get(key, 0) > 1]

    if len(duplicates) > 0:
        raise ValueError("In your .bib file, please remove duplicate entries or duplicate entry ID keys for:", ' '.join(map(str, duplicates)))

    if os.path.exists(paper_bib_file):
        os.remove(paper_bib_file)

    with open(paper_bib_file, 'w') as bibtex_file:
        bibtexparser.dump(bib_data, bibtex_file)

    # Exclude citations authored by the citing paper's authors.
    # Beware of latex symbols within author names: matching is a plain
    # substring test against the raw author field.
    in_paper_citations = [entry['ID'] for entry in bib_data.entries]  # citation keys in paper

    # Raw author string for every cited paper; entries without an author
    # field (e.g. editor-only) cannot match and are never self-citations here.
    cited_authors = [entry.get('author', '') for entry in bib_data.entries]
    # One flag per citation: True if any citing author appears in its author string.
    self_cite_mask = np.array(
        [any(citing_author in authors for citing_author in citing_authors) for authors in cited_authors],
        dtype=bool,
    )

    print("Self-citations: ", [key for key, is_self in zip(in_paper_citations, self_cite_mask) if is_self])
    for key, is_self in zip(in_paper_citations, self_cite_mask):
        if is_self:
            del bib_data.entries_dict[key]  # delete citation from dictionary if self citation
    # Replace entries list with the entries that aren't self citations.
    bib_data.entries = [entry for entry, is_self in zip(bib_data.entries, self_cite_mask) if not is_self]

    paper_bib_file_excl_sc = os.path.splitext(paper_bib_file)[0] + '_noselfcite.bib'

    if os.path.exists(paper_bib_file_excl_sc):
        os.remove(paper_bib_file_excl_sc)

    with open(paper_bib_file_excl_sc, 'w') as bibtex_file:
        bibtexparser.dump(bib_data, bibtex_file)
        
# Bibliography without self-citations, as written by the .aux-based cleaning step.
paper_bib_file_excl_sc = os.path.splitext(paper_bib_file)[0] + '_noselfcite.bib'

# os.path.exists() does not expand wildcards, so the previous test against
# '*_noselfcite.bib' was never true and the cleaned file was never selected.
# Test the concrete path instead, and only trust it when an .aux file was
# available to (re)generate it.
if paper_aux_file and os.path.exists(paper_bib_file_excl_sc):
    ID = [paper_bib_file_excl_sc]
else:
    # No cleaned bibliography: fall back to an uploaded .bib file.  Sort for a
    # reproducible choice and prefer files that were not generated by this
    # notebook; if only generated files exist, use them.
    generated_names = {os.path.basename(paper_bib_file), os.path.basename(paper_bib_file_excl_sc)}
    candidate_bibs = sorted(glob.glob(homedir + '*bib'))
    ID = [p for p in candidate_bibs if os.path.basename(p) not in generated_names] or candidate_bibs
    if not ID:
        raise FileNotFoundError("No .bib file was found in " + homedir)

    with open(ID[0]) as bibtex_file:
        bib_data = bibtexparser.bparser.BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)

    # A key is a duplicate when more than one parsed entry carries it; count
    # the IDs themselves rather than searching the entries' repr string.
    id_counts = {}
    for entry in bib_data.entries:
        id_counts[entry['ID']] = id_counts.get(entry['ID'], 0) + 1
    duplicates = [key for key in bib_data.entries_dict if id_counts.get(key, 0) > 1]

    if len(duplicates) > 0:
        raise ValueError("In your .bib file, please remove duplicate entries or duplicate entry ID keys for:", ' '.join(map(str, duplicates)))

# Template value shipped with the notebook; a name still equal to it has not been filled in.
_NAME_PLACEHOLDER = 'LastName, FirstName OptionalMiddleInitial'


def _plain_name(rich_names):
    # Text of a pybtex rich-name list, obtained by trimming 7 leading and 3
    # trailing characters from its string form (presumably the "[Text('" and
    # "')]" wrapper -- confirm against the pybtex version in use).
    return str(rich_names)[7:-3]


def _is_person(person, name):
    # True if `name` occurs in the "[Last, First]" label built for `person`.
    label = str([_plain_name(person.rich_last_names), _plain_name(person.rich_first_names)]).replace("'", "")
    return name in label


FA = []
LA = []
parser = bibtex.Parser()
bib_data = parser.parse_file(ID[0])
counter = 1
nameCount = 0
outPath = homedir + 'cleanedBib.csv'

# Equal contributors as a list of individual names.  A single name given as a
# plain string is wrapped, so it is no longer iterated character by character
# (which flagged almost every entry as a self-citation).
if isinstance(optionalEqualContributors, str):
    equal_contributors = [optionalEqualContributors]
else:
    equal_contributors = list(optionalEqualContributors)
equal_contributors = [name for name in equal_contributors if name != _NAME_PLACEHOLDER]

if os.path.exists(outPath):
    os.remove(outPath)

# Write the header and all rows through one file handle instead of
# reopening the CSV in append mode for every entry.
with open(outPath, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite', 'CitationKey'])

    for key in bib_data.entries:
        entry = bib_data.entries[key]
        # Fall back to the editors when an entry lists no authors.
        try:
            author = entry.persons['author']
        except KeyError:
            author = entry.persons['editor']
        FA = author[0].rich_first_names
        LA = author[-1].rich_first_names

        if (yourFirstAuthor != _NAME_PLACEHOLDER) and (yourLastAuthor != _NAME_PLACEHOLDER):
            # Self-citation: the first/last author or any equal contributor is among the persons.
            matches_main_author = any(
                _is_person(person, yourLastAuthor) or _is_person(person, yourFirstAuthor)
                for person in author
            )
            nameCount = sum(
                1 for name in equal_contributors
                if any(_is_person(person, name) for person in author)
            )
            if matches_main_author or nameCount > 0:
                selfCite = 'Y'
                print(str(counter) + ": " + key + " <-- self-citation")
            else:
                selfCite = 'N'
                print(str(counter) + ": " + key)
        else:
            selfCite = 'NA'

        # Only non-self-citations are exported.  Commas are stripped from the
        # title; an entry without a title gets an empty cell instead of a KeyError.
        if selfCite == 'N':
            title = entry.fields.get('title', '').replace(',', '')
            writer.writerow([counter, _plain_name(FA), _plain_name(LA), title, selfCite, key])
        counter += 1