In [17]:
import pandas as pd
from pyzotero.zotero import Zotero
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

ZOTERO_ID = os.getenv("ZOTERO_ID")
VAULT_PATH = os.getenv("VAULT_PATH")
REFERENCES = os.getenv("REFERENCES")
REFERENCES = "3. Literature"

zot = Zotero(ZOTERO_ID, "user", "Meine Biblliothek", local=True)

In [3]:
items = zot.everything(zot.items())

In [45]:
reference_dictionary = {}
for item in items:
    if item['data']['itemType'] != 'attachment|note':
        try:
            # Attempt to access the 'title' field in the 'data' dictionary
            title = item['data']['title']
            key = item['key']
            # everything between "Citation Key: " and the next newline
            if "extra" in item["data"]:
                # Extract the citation key from the 'extra' field
                if "Citation Key: " in item["data"]["extra"]:
                    citekey = f"@{item['data']['extra'].split('Citation Key: ')[1].split('\\n')[0]}"
            paper_dict = {
                "title": title,
                "key": key,
                "citekey": citekey
            }
            reference_dictionary[key] = paper_dict
        except KeyError:
            pass

In [46]:
items[1]

{'key': 'EUZZYZFS',
 'version': 32991,
 'library': {'type': 'user',
  'id': 6845452,
  'name': 'Meine Bibliothek',
  'links': {'self': {'href': 'http://localhost:23119/api/users/6845452',
    'type': 'application/json'},
   'alternate': {'href': 'https://www.zotero.org/users/6845452',
    'type': 'text/html'}}},
 'links': {'self': {'href': 'http://localhost:23119/api/users/6845452/items/EUZZYZFS',
   'type': 'application/json'},
  'alternate': {'href': 'https://www.zotero.org/users/6845452/items/EUZZYZFS',
   'type': 'text/html'},
  'up': {'href': 'http://localhost:23119/api/users/6845452/items/RWSZMPXZ',
   'type': 'application/json'},
  'enclosure': {'href': 'file:///C:/Users/Admin/Zotero/storage/EUZZYZFS/Cheng%20et%20al.%20-%202025%20-%20Green%20innovation%20and%20firms%E2%80%99%20financial%20and%20environmental%20performance%20the%20roles%20of%20pollution%20preventi.pdf',
   'type': 'application/pdf',
   'title': 'Cheng et al. - 2025 - Green innovation and firms’ financial and envi

In [47]:
itemTypes

{'attachment',
 'blogPost',
 'book',
 'bookSection',
 'computerProgram',
 'conferencePaper',
 'dataset',
 'document',
 'journalArticle',
 'magazineArticle',
 'manuscript',
 'newspaperArticle',
 'note',
 'patent',
 'preprint',
 'presentation',
 'report',
 'thesis',
 'webpage'}

In [49]:
df_citation = pd.DataFrame.from_dict(reference_dictionary, orient='index')
# create a zotero link with the key zotero://select/library/items/YIYQNCBC
df_citation['zotero_link'] = df_citation['key'].apply(lambda x: f"zotero://select/library/items/{x}")
df_citation

Unnamed: 0,title,key,citekey,zotero_link
RWSZMPXZ,Green innovation and firms’ financial and envi...,RWSZMPXZ,@chengGreenInnovationFirms2025,zotero://select/library/items/RWSZMPXZ
EUZZYZFS,Full Text PDF,EUZZYZFS,@chengGreenInnovationFirms2025,zotero://select/library/items/EUZZYZFS
GJWBV6ZV,ScienceDirect Snapshot,GJWBV6ZV,@chengGreenInnovationFirms2025,zotero://select/library/items/GJWBV6ZV
F5YBAX47,The impact of green bond issuance on corporate...,F5YBAX47,@liImpactGreenBond2025,zotero://select/library/items/F5YBAX47
BE62I7EJ,PDF,BE62I7EJ,@liImpactGreenBond2025,zotero://select/library/items/BE62I7EJ
...,...,...,...,...
VNIFGX4C,NIPS Snapshot,VNIFGX4C,@abadComprehensiveReviewValue2014,zotero://select/library/items/VNIFGX4C
9SXXLJT9,arXiv.org Snapshot,9SXXLJT9,@abadComprehensiveReviewValue2014,zotero://select/library/items/9SXXLJT9
YY8SFYV6,Snapshot,YY8SFYV6,@abadComprehensiveReviewValue2014,zotero://select/library/items/YY8SFYV6
GEJFKW4T,arXiv.org Snapshot,GEJFKW4T,@abadComprehensiveReviewValue2014,zotero://select/library/items/GEJFKW4T


# Read Obsidian Data

In [50]:
# read all MD files in VAULT_PATH + REFERENCES
md_files = [f for f in os.listdir(os.path.join(VAULT_PATH, REFERENCES)) if f.endswith('.md')]
print(f"Found {len(md_files)} markdown files in {os.path.join(VAULT_PATH, REFERENCES)}")

Found 6 markdown files in C:\Users\Admin\Insync\d.h.jaggi@gmail.com\Google Drive\projects\4. Work\Research\3. Literature


In [51]:
# read all the markdown files and keep only the ones which do not start with ---
md_files = [f for f in md_files if not open(os.path.join(VAULT_PATH, REFERENCES, f)).readline().startswith('---')]
print(f"Filtered to {len(md_files)} markdown files that do not start with ---")

Filtered to 1 markdown files that do not start with ---


In [53]:
# add a yaml header with 
# ---
# title: 
# citekey:
# zotero:
# tags:
#   - paper
# ---
for md_file in md_files:
    print(f"Processing file: {md_file}")
    file_path = os.path.join(VAULT_PATH, REFERENCES, md_file)
    # only select the files which do not have a yaml header
    if open(file_path, 'r').readline().startswith('---'):
        print(f"Skipping file {md_file} as it already has a YAML header.")
        continue
    
    # Extract the title from the filename (without .md)
    note_title = md_file[:-3]
    
    # Find the corresponding citation key and zotero link
    citekey = df_citation[df_citation['citekey'] == note_title]['citekey'].values
    zotero_link = df_citation[df_citation['citekey'] == note_title]['zotero_link'].values
    title = df_citation[df_citation['citekey'] == note_title]['title'].values
    # title as string
    title = title[0]
    
    if citekey.size > 0 and zotero_link.size > 0:
        citekey = citekey[0]
        zotero_link = zotero_link[0]
        
        # Create the YAML header
        yaml_header = f'---\ntitle: "{title}"\ncitekey: "{citekey}"\nzotero: {zotero_link}\ntags:\n  - paper\n---\n'
        
        # prepend the YAML header to the file
        with open(file_path, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(yaml_header + content)

Processing file: @kileUsingIndustryClassification2009.md
