In [None]:
import numpy as np
import bibtexparser
import glob
import subprocess
import os

def checkcites_output(aux_file):
    """Run checkcites on a LaTeX .aux file and collect the citation keys it reports.

    The script ``checkcites.lua`` is executed through ``texlua`` and its
    standard output is scanned for lines that start with ``-``; each such
    line is treated as ``- <citation key>`` and the first two characters are
    dropped. These are the keys present in the .bib file but not cited in
    the document.

    Parameters
    ----------
    aux_file : list of str, or str
        Path(s) to the .aux file, typically the list returned by
        ``glob.glob``; only the first element is used. A plain string is
        also accepted and used as the path itself.

    Returns
    -------
    list of str
        The collected citation keys.
    str
        The complete decoded checkcites output instead, when one of the
        collected items is the dashed separator line checked below (i.e.
        the bullet lines were not all citation keys).

    Raises
    ------
    IndexError
        If ``aux_file`` is an empty list.
    """
    # Accept a bare path as well as the glob-style list the callers pass in.
    aux_path = aux_file if isinstance(aux_file, str) else aux_file[0]

    completed = subprocess.run(['texlua', 'checkcites.lua', aux_path], stdout=subprocess.PIPE)
    result = completed.stdout.decode('utf-8')

    # A line whose first character is '-' is taken to be "- <key>";
    # slicing off two characters removes the "- " prefix. Empty lines never
    # start with '-', so they are skipped automatically. splitlines() also
    # copes with '\r\n' line endings.
    unused_array_final = [line[2:] for line in result.splitlines() if line.startswith('-')]

    # If a dashed separator was picked up as a "key", hand back the raw
    # output rather than a polluted key list.
    if "------------------------------------------------------------------------" in unused_array_final:
        return result
    return unused_array_final

# For checkcites to work, the .bib file must exist at the path specified in
# the .tex file that was used to generate the .aux file.
homedir = '/home/jovyan/'
bib_files = sorted(glob.glob(homedir + '*.bib'))  # all .bib libraries; sorted so the merge order is deterministic
paper_aux_file = glob.glob(homedir + '*.aux')  # .aux file(s) of the paper; checkcites_output uses the first one
paper_bib_file = 'library_paper.bib'  # name of the .bib file written below

if not bib_files:
    raise FileNotFoundError("No .bib file found in " + homedir)
if not paper_aux_file:
    raise FileNotFoundError("No .aux file found in " + homedir)

# Run checkcites once and reuse the result: these are the citations that are
# in the library but not used in the paper.
unused_in_paper = checkcites_output(paper_aux_file)
print(unused_in_paper)
print("Unused citations: ", len(unused_in_paper))

# Load every .bib file and merge them into a single database.
bib_data = None
for bib_file in bib_files:
    # Use a fresh parser for each file (a parser instance keeps the entries
    # it has already parsed), and configure every one the same way so that
    # non-standard entry types are kept for the first file too.
    parser = bibtexparser.bparser.BibTexParser()
    parser.ignore_nonstandard_types = False
    with open(bib_file) as bibtex_file:
        bib_data_extra = bibtexparser.load(bibtex_file, parser)
    if bib_data is None:
        bib_data = bib_data_extra
    else:
        bib_data.entries_dict.update(bib_data_extra.entries_dict)
        bib_data.entries.extend(bib_data_extra.entries)

all_library_citations = list(bib_data.entries_dict.keys())
print("All citations: ", len(all_library_citations))

# Drop unused citations from the entries dictionary ...
for k in all_library_citations:
    if k in unused_in_paper:
        del bib_data.entries_dict[k]

# ... and from the entries list, so both views of the database stay in sync.
in_paper_mask = [entry['ID'] not in unused_in_paper for entry in bib_data.entries]
bib_data.entries = [entry for entry, keep in zip(bib_data.entries, in_paper_mask) if keep]

# paper_bib_file is a file name (a string), so it is used as-is; indexing it
# would open a file named after its first character.
with open(paper_bib_file, 'w') as bibtex_file:
    bibtexparser.dump(bib_data, bibtex_file)

# Remove self-citations from consideration. A self-citation is a cited paper
# for which either the first or the last author of the citing paper was a
# co-author.

# First-author and last-author names of the citing paper -- citations of
# these authors are excluded. The names are matched as plain substrings of
# each entry's 'author' field, so beware of LaTeX symbols within author names.
citing_authors = np.array(['Teich, Erin G.', 'Bassett, Danielle S.'])

# Citation keys of the entries currently in the database, in list order.
in_paper_citations = [entry['ID'] for entry in bib_data.entries]

# Author string of every cited paper, aligned with in_paper_citations.
# Entries without an 'author' field (e.g. editor-only entries) get an empty
# string, so they can never match and are kept.
cited_authors = [entry.get('author', '') for entry in bib_data.entries]

# One boolean per citation: True when any citing author appears in the
# cited paper's author string.
self_cite_mask = np.array(
    [any(citing_author in authors for citing_author in citing_authors) for authors in cited_authors],
    dtype=bool,
)

print("Self-citations: ", [key for key, is_self in zip(in_paper_citations, self_cite_mask) if is_self])  # print self citations

for k, is_self in zip(in_paper_citations, self_cite_mask):
    if is_self:
        # pop() rather than del: a key that occurs twice in the entries list
        # is only present once in the dictionary.
        bib_data.entries_dict.pop(k, None)  # delete citation from dictionary if self-citation

# Replace the entries list with the entries that are not self-citations.
bib_data.entries = [entry for entry, is_self in zip(bib_data.entries, self_cite_mask) if not is_self]

paper_bib_file_excl_sc = os.path.splitext(paper_bib_file)[0] + '_noselfcite.bib'

with open(paper_bib_file_excl_sc, 'w') as bibtex_file:
    bibtexparser.dump(bib_data, bibtex_file)

In [None]:
from pybtex.database.input import bibtex
import glob
import csv

# Set these to the first and last author of the citing paper, written as
# 'LastName, FirstName'. While either one is still the placeholder below,
# the SelfCite column is filled with 'NA'.
yourFirstAuthor = 'LastName, FirstName'
yourLastAuthor = 'LastName, FirstName'

baseDirectory = "/home/jovyan/"
ID = glob.glob(baseDirectory + '*bib')
print(ID)  # show the list of .bib files that were found

if not ID:
    raise FileNotFoundError("No .bib file found in " + baseDirectory)

FA = []
LA = []

parser = bibtex.Parser()
bib_data = parser.parse_file(ID[0])  # only the first match is parsed
outPath = baseDirectory + 'cleanedBib.csv'


def _plain_name(rich_names):
    """Join a list of pybtex rich-text name parts into one plain string."""
    return ' '.join(str(part) for part in rich_names)


def _entry_people(entry):
    """Return the entry's authors, else its editors, else an empty list."""
    for role in ('author', 'editor'):
        try:
            people = entry.persons[role]
        except KeyError:
            continue
        if people:
            return people
    return []


# The file is opened once; the header and every row go through one writer.
with open(outPath, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',',
                        quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite','CitationKey'])

    for counter, key in enumerate(bib_data.entries.keys(), start=1):
        print(key)
        entry = bib_data.entries[key]
        author = _entry_people(entry)

        if author:
            # First names of the first and of the last listed person.
            FA = _plain_name(author[0].rich_first_names)
            LA = _plain_name(author[-1].rich_first_names)
        else:
            # Keep the row (with blank names) so the key can be fixed by hand.
            print("Warning: no author or editor found for " + str(key))
            FA = ''
            LA = ''
        print(FA)
        print(LA)

        if (yourFirstAuthor != 'LastName, FirstName') and (yourLastAuthor != 'LastName, FirstName'):
            # Format every listed person as 'Last, First' and look for the
            # configured names as substrings.
            formatted_people = [
                _plain_name(person.rich_last_names) + ', ' + _plain_name(person.rich_first_names)
                for person in author
            ]
            is_self_cite = any(
                (yourLastAuthor in name) or (yourFirstAuthor in name)
                for name in formatted_people
            )
            selfCite = 'Y' if is_self_cite else 'N'
        else:
            selfCite = 'NA'

        # Commas are removed from the title, as before; a missing title
        # becomes an empty cell.
        title = entry.fields['title'].replace(',', '') if 'title' in entry.fields else ''
        writer.writerow([counter, FA, LA, title, selfCite, key])