In [1]:
import json
import os
import re

import pandas as pd
from dotenv import load_dotenv
from pyzotero.zotero import Zotero

# Load environment variables from .env file
load_dotenv()

# Zotero library id and the root folder of the Obsidian vault.
ZOTERO_ID = os.getenv("ZOTERO_ID")
VAULT_PATH = os.getenv("VAULT_PATH")
# Vault sub-folder holding the literature notes. The environment value is
# honoured when set; "3. Literature" is only the fallback for an unset or
# empty variable.
REFERENCES = os.getenv("REFERENCES") or "3. Literature"

# NOTE(review): the third positional argument sits in pyzotero's api_key slot;
# with local=True it is presumably unused — confirm against the pyzotero docs.
zot = Zotero(ZOTERO_ID, "user", "Meine Biblliothek", local=True)

In [2]:
# Retrieve the library's items through the pyzotero client. `everything(...)`
# wraps the `items()` call — presumably to follow pagination until all results
# are collected; confirm against the pyzotero docs.
items = zot.everything(zot.items())

In [3]:
# Flatten the nested item dicts into dotted columns (e.g. `data.title`,
# `links.enclosure.type`). The records pass through a plain DataFrame first and
# are turned back into a list of dicts before being normalised.
df_items = pd.json_normalize(pd.DataFrame(items).to_dict(orient="records"))

In [8]:
# write items to parquet file
# Raw string: the Windows path contains backslashes (`\P`, `\p`, `\d`) that are
# not valid escape sequences; a plain literal triggers a SyntaxWarning and is
# slated to become an error. The resulting path is character-for-character the
# same.
df_items.to_parquet(
    r"E:\PycharmProjects\pyobsidian\data/zotero_items.parquet",
    index=False,
    engine='pyarrow',
)

  "E:\PycharmProjects\pyobsidian\data/zotero_items.parquet",


In [9]:
# create column data.citekey
def create_citekey(row):
    """
    Build the ``@<citekey>`` identifier from the item's ``data.extra`` field.

    The key is taken from the text following ``Citation Key: `` up to the end
    of that line. Both a real line break and a literal backslash-n sequence
    are treated as the end of the line.

    Parameters
    ----------
    row : mapping
        Anything supporting ``row['data.extra']`` (a DataFrame row or a dict).

    Returns
    -------
    str or None
        The citekey prefixed with ``@``, or None when ``data.extra`` is not a
        string (None / NaN), has no ``Citation Key: `` entry, or the entry is
        empty.
    """
    marker = 'Citation Key: '
    extra = row['data.extra']

    # Missing values show up as None or NaN (a float); only strings can carry a key.
    if not isinstance(extra, str) or marker not in extra:
        return None

    remainder = extra.split(marker)[1]
    # Cut at an escaped "\\n" (two characters) first, then at any real line break.
    remainder = remainder.split('\\n')[0]
    first_line = remainder.splitlines()[0] if remainder.splitlines() else ''
    key = first_line.strip()

    return f"@{key}" if key else None
# apply to dataframe: axis=1 passes each row to create_citekey; the returned
# values are stored in the new `data.citekey` column
df_items['data.citekey'] = df_items.apply(create_citekey, axis=1)

def create_year(row):
    """
    Extract the publication year from the item's free-form ``data.date`` field.

    The following formats are expected:
    - 2009-01-01
    - 2025/02
    - 2024
    - 03/2018
    - 10-Aug-2020
    - August 30, 2018

    The value is first handed to ``pd.to_datetime``. When that yields no date
    (e.g. "Spring 2019"), the first standalone four-digit number in the range
    1000-2999 is used as the year instead.

    Parameters
    ----------
    row : mapping
        Anything supporting ``row['data.date']`` (a DataFrame row or a dict).

    Returns
    -------
    int or None
        The year, or None when the field is missing (None / NaN), blank, or
        contains no recognisable year.
    """
    raw = row['data.date']

    # Missing values arrive as None or NaN (a float); only non-blank strings are parsed.
    if not isinstance(raw, str) or not raw.strip():
        return None

    try:
        parsed = pd.to_datetime(raw, errors='coerce')
    except (ValueError, TypeError, OverflowError):
        # A single odd date must not abort the whole row-wise apply.
        parsed = pd.NaT

    # errors='coerce' signals failure with NaT; check for it explicitly rather
    # than relying on int(NaN) raising.
    if not pd.isna(parsed):
        return int(parsed.year)

    # Fallback: a four-digit year not embedded in a longer run of digits.
    match = re.search(r'(?<!\d)([12]\d{3})(?!\d)', raw)
    return int(match.group(1)) if match else None
        
# apply to dataframe: axis=1 passes each row to create_year; rows for which it
# returns None end up as missing values in the new `data.year` column
df_items['data.year'] = df_items.apply(create_year, axis=1)

# create the zotero_link
def create_zotero_link(row):
    """
    Return the ``zotero://select/items/<key>`` URI for the item in ``row``.

    ``row`` only needs to support ``row['key']`` (a DataFrame row or a dict).
    """
    item_key = row['key']
    return "zotero://select/items/{}".format(item_key)

# apply to dataframe: axis=1 passes each row to create_zotero_link; the
# resulting URIs are stored in the new `data.zotero_link` column
df_items['data.zotero_link'] = df_items.apply(create_zotero_link, axis=1)

  return int(pd.to_datetime(row['data.date'], errors='coerce').year)


# Read Obsidian Data

In [10]:
# Folder with the literature notes and its "pdfs" sub-folder
references_dir = os.path.join(VAULT_PATH, REFERENCES)
pdf_dir = os.path.join(VAULT_PATH, REFERENCES, "pdfs")

# collect the markdown notes found directly in VAULT_PATH + REFERENCES
md_files = [name for name in os.listdir(references_dir) if name.endswith('.md')]
print(f"Found {len(md_files)} markdown files in {references_dir}")

# collect the pdfs stored next to the notes
pdf_files = [name for name in os.listdir(pdf_dir) if name.endswith('.pdf')]
print(f"Found {len(pdf_files)} pdf files in {pdf_dir}")

Found 41 markdown files in C:\Users\Admin\Insync\d.h.jaggi@gmail.com\Google Drive\projects\4. Work\Research\3. Literature
Found 23 pdf files in C:\Users\Admin\Insync\d.h.jaggi@gmail.com\Google Drive\projects\4. Work\Research\3. Literature\pdfs


In [11]:
# Show the item table including the derived `data.citekey`, `data.year` and
# `data.zotero_link` columns (the notebook repr truncates long frames).
df_items

Unnamed: 0,key,version,library.type,library.id,library.name,library.links.self.href,library.links.self.type,library.links.alternate.href,library.links.alternate.type,links.self.href,...,data.patentNumber,data.filingDate,data.applicationNumber,data.issueDate,data.seriesTitle,data.path,data.meetingName,data.citekey,data.year,data.zotero_link
0,PCTSIRM8,33496,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,@papenbrockAssetClustersAsset2011,2011.0,zotero://select/items/PCTSIRM8
1,LWNR3XN9,33492,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,@raffinotHierarchicalClusteringBased2017,2017.0,zotero://select/items/LWNR3XN9
2,H35Q4HDS,33478,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/H35Q4HDS
3,UJ7ZINHY,33449,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/UJ7ZINHY
4,IR5SUJAX,33447,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,@chinloyInvestmentDepreciationObsolescence2020,2020.0,zotero://select/items/IR5SUJAX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5221,65R24QLP,50,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/65R24QLP
5222,YY8SFYV6,48,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/YY8SFYV6
5223,GEJFKW4T,1046,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/GEJFKW4T
5224,LRZQG68T,1335,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/LRZQG68T


In [12]:
def _is_missing(value):
    """Return True for None / NaN-like scalars that must not be rendered as text."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values (lists, arrays) are not "missing".
        return False


def _yaml_quote(value):
    """Render ``value`` as a double-quoted YAML scalar; missing values become ``""``."""
    text = "" if _is_missing(value) else str(value)
    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # json.dumps takes care of embedded quotes, backslashes and line breaks.
    # ensure_ascii=False keeps umlauts and other non-ASCII text readable.
    return json.dumps(text, ensure_ascii=False)


def _build_yaml_header(item):
    """
    Build the YAML front matter for one Zotero item.

    Raises KeyError when one of the required columns (``data.citekey``,
    ``data.zotero_link``, ``data.title``, ``data.year``,
    ``meta.creatorSummary``) is absent from ``item``.
    """
    citekey = item['data.citekey']
    zotero_link = item['data.zotero_link']
    title = item['data.title']
    year = item['data.year']
    abstract = item['data.abstractNote'] if 'data.abstractNote' in item else ''
    creator = item['meta.creatorSummary']

    # The year column holds floats (e.g. 2011.0) because of missing values.
    if _is_missing(year):
        year = None
    elif isinstance(year, float):
        year = int(year)

    # "Creator (year)" aliases only make sense when both parts are known;
    # otherwise they would read "nan (2011)" or "Smith (nan)".
    aliases = []
    if year is not None and not _is_missing(creator):
        aliases.append(f"{creator} ({year})")
        aliases.append(f"{creator}, ({year})")
    if not _is_missing(title):
        aliases.append(title)

    lines = [
        "---",
        f"title: {_yaml_quote(title)}",
        f"citekey: {_yaml_quote(citekey)}",
        f"zotero: {zotero_link}",
        f"abstract: {_yaml_quote(abstract)}",
        "aliases:" if aliases else "aliases: []",
        *[f"  - {_yaml_quote(alias)}" for alias in aliases],
        "tags:",
        "  - paper",
        "---",
    ]
    return "\n".join(lines) + "\n"


def _download_pdf(item, note_title, pdf_file, pdf_dir):
    """Best-effort download of the item's first PDF attachment into ``pdf_dir``."""
    try:
        parent_key = item['key']
        pdf_attachments = df_items[
            (df_items["data.parentItem"] == parent_key) &
            (df_items['links.enclosure.type'] == 'application/pdf')
        ]

        if pdf_attachments.empty:
            print(f"Warning: No PDF attachment found for {note_title} in Zotero.")
            return

        pdf_key = pdf_attachments.iloc[0]['key']
        zot.dump(pdf_key, pdf_file, pdf_dir)
        print(f"Downloaded PDF for {note_title}")

    except Exception as e:
        # Deliberately broad: a failed download must not stop the remaining notes.
        print(f"Error downloading PDF for {note_title}: {e}")


def process_markdown_files():
    """
    Process markdown files to add YAML headers and download PDFs from Zotero.

    For every file in ``md_files`` (looked up in ``VAULT_PATH``/``REFERENCES``):

    1. Files whose content already starts with ``---`` are skipped.
    2. The note name (file name without ``.md``) is matched against
       ``df_items['data.citekey']``; notes without a match are skipped.
    3. A YAML front matter block is prepended to the file. Title, citekey,
       abstract and aliases are written as escaped double-quoted scalars, so
       quotes or line breaks in the Zotero metadata cannot break the YAML.
    4. If ``<note name>.pdf`` is not listed in ``pdf_files``, the first PDF
       attachment of the item is downloaded into the ``pdfs`` sub-folder.

    PDFs are only fetched for notes that receive a header in this run.
    Relies on the module-level names ``md_files``, ``pdf_files``, ``df_items``,
    ``zot``, ``VAULT_PATH`` and ``REFERENCES``. Progress and warnings are
    printed; nothing is returned.
    """
    references_dir = os.path.join(VAULT_PATH, REFERENCES)
    pdf_dir = os.path.join(VAULT_PATH, REFERENCES, "pdfs")

    for md_file in md_files:
        print(f"Processing file: {md_file}")
        file_path = os.path.join(references_dir, md_file)
        note_title = md_file[:-3]  # Remove .md extension

        # Read the note once: used for the header check and for the rewrite.
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        if content.startswith('---'):
            print(f"File {md_file} already has YAML header. Skipping.")
            continue

        # Find matching item in Zotero data
        matching_items = df_items[df_items['data.citekey'] == note_title]
        if matching_items.empty:
            print(f"Warning: No matching item found for {note_title}. Skipping file.")
            continue

        item = matching_items.iloc[0]

        try:
            yaml_header = _build_yaml_header(item)
        except KeyError as e:
            print(f"Warning: Missing required field {e} for {note_title}. Skipping file.")
            continue

        # Rewrite the note with the header in front of the original content.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(yaml_header + content)

        print(f"Added YAML header to {md_file}")

        # Check and download PDF if needed
        pdf_file = f"{note_title}.pdf"
        if pdf_file not in pdf_files:
            _download_pdf(item, note_title, pdf_file, pdf_dir)

# Run the function (rewrites the matching note files in place and may download
# PDFs into the "pdfs" sub-folder)
process_markdown_files()

Processing file: @araciFinBERTFinancialSentiment2019.md
File @araciFinBERTFinancialSentiment2019.md already has YAML header. Skipping.
Processing file: @bellstamTextBasedAnalysisCorporate2019.md
Added YAML header to @bellstamTextBasedAnalysisCorporate2019.md
Downloaded PDF for @bellstamTextBasedAnalysisCorporate2019
Processing file: @benaCorporateInnovationsMergers2014.md
File @benaCorporateInnovationsMergers2014.md already has YAML header. Skipping.
Processing file: @benaPatentIntensityFirm2022.md
Added YAML header to @benaPatentIntensityFirm2022.md
Downloaded PDF for @benaPatentIntensityFirm2022
Processing file: @bhojrajWhatMyLine2003.md
File @bhojrajWhatMyLine2003.md already has YAML header. Skipping.
Processing file: @binglerCheapTalkCherrypicking2022.md
File @binglerCheapTalkCherrypicking2022.md already has YAML header. Skipping.
Processing file: @bleiLatentDirichletAllocation2003.md
File @bleiLatentDirichletAllocation2003.md already has YAML header. Skipping.
Processing file: @br