In [60]:
import os
import re

import pandas as pd
from dotenv import load_dotenv
from pyzotero.zotero import Zotero

# Load environment variables from .env file
load_dotenv()

ZOTERO_ID = os.getenv("ZOTERO_ID")
VAULT_PATH = os.getenv("VAULT_PATH")
# Folder inside the vault that holds the literature notes. The value from
# .env wins; "3. Literature" is only the fallback when the variable is unset
# or empty (previously the environment value was read and then overwritten).
REFERENCES = os.getenv("REFERENCES") or "3. Literature"

# Fail early with a readable message instead of a late TypeError from
# os.path.join(None, ...) or a credentials error from the Zotero client.
_missing_settings = [
    name
    for name, value in (("ZOTERO_ID", ZOTERO_ID), ("VAULT_PATH", VAULT_PATH))
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# local=True talks to the Zotero desktop client on this machine.
# NOTE(review): per pyzotero's constructor signature the third positional
# argument is the API key slot, not a library name - confirm it is ignored
# for local connections before changing the string.
zot = Zotero(ZOTERO_ID, "user", "Meine Biblliothek", local=True)

In [61]:
# Fetch the first batch of library items, then let everything() collect the
# remaining result pages into one list.
first_batch = zot.items()
items = zot.everything(first_batch)

In [62]:
# Flatten the nested item dicts into one wide table: nested keys become
# dotted column names such as "data.title" or "meta.creatorSummary".
item_records = pd.DataFrame(items).to_dict(orient="records")
df_items = pd.json_normalize(item_records)

In [63]:
# create column data.citekey
def create_citekey(row):
    """
    Build the note-style citekey ("@<key>") from an item's "extra" field.

    The key is taken from the text following the marker "Citation Key: "
    up to the end of that line.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the entry 'data.extra'.

    Returns
    -------
    str or None
        "@<key>", or None when the field is missing/not a string, contains
        no "Citation Key: " marker, or the key is empty.
    """
    extra = row['data.extra']
    # Missing values show up as None, "" or NaN (a float); none of them can
    # carry a citation key, so reject every non-string up front.
    if not isinstance(extra, str):
        return None

    _, marker, tail = extra.partition('Citation Key: ')
    if not marker:
        return None

    # The key ends with its line. Accept a real newline as well as the
    # literal two-character sequence backslash + "n" (the only form the
    # previous implementation recognised).
    key = tail.replace('\\n', '\n').split('\n', 1)[0].strip()
    return f"@{key}" if key else None
# Run create_citekey over every row and store the result as a new column
citekeys = df_items.apply(create_citekey, axis="columns")
df_items['data.citekey'] = citekeys

def create_year(row):
    """
    Extract the publication year from an item's date field.

    The following formats are expected:
    - 2009-01-01
    - 2025/02
    - 2024
    - 03/2018
    - 10-Aug-2020
    - August 30, 2018

    The value is parsed with pandas first. If pandas cannot parse it, the
    first standalone four-digit number starting with 1 or 2 is used as the
    year, so free-text dates that still mention a year are not lost.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the entry 'data.date'.

    Returns
    -------
    int or None
        The year, or None when the date is missing or holds no usable year.
    """
    raw_date = row['data.date']
    if raw_date is None or (isinstance(raw_date, str) and not raw_date.strip()):
        return None

    try:
        parsed = pd.to_datetime(raw_date, errors='coerce')
    except (ValueError, TypeError, OverflowError):
        parsed = pd.NaT

    # errors='coerce' yields NaT for unparseable input; test for it
    # explicitly instead of relying on int(nan) raising ValueError.
    if not pd.isna(parsed):
        return int(parsed.year)

    if isinstance(raw_date, str):
        year_match = re.search(r"(?<!\d)([12]\d{3})(?!\d)", raw_date)
        if year_match:
            return int(year_match.group(1))
    return None
        
# Row-wise year extraction; the result becomes the 'data.year' column
years = df_items.apply(create_year, axis="columns")
df_items['data.year'] = years


  return int(pd.to_datetime(row['data.date'], errors='coerce').year)


In [64]:
# create the zotero_link
def create_zotero_link(row):
    """Return the ``zotero://select/items/<key>`` link built from ``row['key']``."""
    item_key = row['key']
    return "zotero://select/items/{}".format(item_key)

# Build the link for every row and store it as the 'data.zotero_link' column
zotero_links = df_items.apply(create_zotero_link, axis="columns")
df_items['data.zotero_link'] = zotero_links

# Read Obsidian Data

In [65]:
# Collect the names of all Markdown notes stored in VAULT_PATH/REFERENCES
references_dir = os.path.join(VAULT_PATH, REFERENCES)
md_files = []
for entry in os.listdir(references_dir):
    if entry.endswith('.md'):
        md_files.append(entry)
print(f"Found {len(md_files)} markdown files in {references_dir}")

Found 12 markdown files in C:\Users\Admin\Insync\d.h.jaggi@gmail.com\Google Drive\projects\4. Work\Research\3. Literature


In [71]:
# Prepend a YAML header to every literature note that does not have one yet:
# ---
# title: 
# citekey:
# zotero:
# aliases:
#   - <creator> (<year>)
# tags:
#   - paper
# ---
# The note's file name (without ".md") must equal the item's 'data.citekey'.

# read all MD files in VAULT_PATH + REFERENCES
references_dir = os.path.join(VAULT_PATH, REFERENCES)
md_files = [f for f in os.listdir(references_dir) if f.endswith('.md')]
print(f"Found {len(md_files)} markdown files in {references_dir}")

for md_file in md_files:
    print(f"Processing file: {md_file}")
    file_path = os.path.join(references_dir, md_file)

    # Read the note once inside a context manager so the handle is closed
    # again. The encoding is pinned to UTF-8 so non-ASCII titles are not
    # written in the platform default code page.
    # NOTE(review): assumes the notes are stored as UTF-8 - confirm.
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # only select the files which do not have a yaml header
    if content.startswith('---'):
        print(f"Skipping file {md_file}.")
        continue

    # Extract the title from the filename (without .md)
    note_title = md_file[:-3]
    print(f"Extracted note title: {note_title}")

    # Look the item up once; only the first match is used
    matches = df_items[df_items['data.citekey'] == note_title]
    if matches.empty:
        print(f"Warning: No matching citekey found for {note_title}. Skipping file.")
        continue
    item = matches.iloc[0]

    citekey = item['data.citekey']
    zotero_link = item['data.zotero_link']
    title = item['data.title']
    year = item['data.year']
    creator = item['meta.creatorSummary']

    # Escape backslashes and double quotes so the double-quoted YAML scalar
    # stays valid for titles that contain them.
    title_text = "" if pd.isna(title) else str(title)
    safe_title = title_text.replace('\\', '\\\\').replace('"', '\\"')

    # Alias "<creator> (<year>)". Missing parts (NaN/None) are left out
    # instead of crashing on int(nan) or rendering the text "nan".
    alias_parts = []
    if not pd.isna(creator):
        alias_parts.append(str(creator))
    if not pd.isna(year):
        # the year column is float when it contains gaps: 2024.0 -> 2024
        alias_parts.append(f"({int(year)})")
    creator_year = " ".join(alias_parts)

    # Create the YAML header
    yaml_lines = [
        '---',
        f'title: "{safe_title}"',
        f'citekey: "{citekey}"',
        f'zotero: {zotero_link}',
    ]
    if creator_year:
        yaml_lines += ['aliases: ', f'  - {creator_year}']
    yaml_lines += ['tags:', '  - paper', '---']
    yaml_header = "\n".join(yaml_lines) + "\n"

    # prepend the YAML header to the file
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(yaml_header + content)

Found 12 markdown files in C:\Users\Admin\Insync\d.h.jaggi@gmail.com\Google Drive\projects\4. Work\Research\3. Literature
Processing file: @bhojrajWhatMyLine2003.md
Skipping file @bhojrajWhatMyLine2003.md.
Processing file: @binglerCheapTalkCherrypicking2022.md
Skipping file @binglerCheapTalkCherrypicking2022.md.
Processing file: @bleiLatentDirichletAllocation2003.md
Extracted note title: @bleiLatentDirichletAllocation2003
Processing file: @cohenESGInnovationDisconnectEvidence2023.md
Skipping file @cohenESGInnovationDisconnectEvidence2023.md.
Processing file: @friederichAutomatedIdentificationClimate2021.md
Skipping file @friederichAutomatedIdentificationClimate2021.md.
Processing file: @hirshleiferInnovativeEfficiencyStock2013.md
Skipping file @hirshleiferInnovativeEfficiencyStock2013.md.
Processing file: @hirshleiferInnovativeOriginalityProfitability2018.md
Skipping file @hirshleiferInnovativeOriginalityProfitability2018.md.
Processing file: @kileUsingIndustryClassification2009.md
Ski

In [72]:
# access the pdfs
# Display the item table (rendered truncated below) before the following
# cells pick out PDF attachments via 'data.parentItem' and
# 'links.enclosure.type'.
df_items

Unnamed: 0,key,version,library.type,library.id,library.name,library.links.self.href,library.links.self.type,library.links.alternate.href,library.links.alternate.type,links.self.href,...,data.patentNumber,data.filingDate,data.applicationNumber,data.issueDate,data.seriesTitle,data.path,data.meetingName,data.citekey,data.year,data.zotero_link
0,KWWQZM8R,33022,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,@schlosserGreenInnovationsPatents2024,2024.0,zotero://select/items/KWWQZM8R
1,9JETXWSZ,33017,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/9JETXWSZ
2,66Z432YR,33018,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/66Z432YR
3,WULAH36P,33015,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,@maeharaMultistageFinetuningPatent2025,2025.0,zotero://select/items/WULAH36P
4,BW4KBI5R,32994,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/BW4KBI5R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5196,65R24QLP,50,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/65R24QLP
5197,YY8SFYV6,48,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/YY8SFYV6
5198,GEJFKW4T,1046,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/GEJFKW4T
5199,LRZQG68T,1335,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/LRZQG68T


In [85]:
# Save the PDF attachment of one Zotero item into <vault>/<references>/pdfs.
# All child items (attachments, notes) of the parent item "E2HVDN2J":
pdfs = df_items[df_items["data.parentItem"] == "E2HVDN2J"]

# filter links.enclosureType == "application/pdf"
pdf_items = pdfs[pdfs['links.enclosure.type'] == 'application/pdf']["key"]

pdf_dir = os.path.join(VAULT_PATH, REFERENCES, "pdfs")
if pdf_items.empty:
    # Previously this case surfaced as a bare IndexError from .tolist()[0]
    print("Warning: no PDF attachment found for parent item E2HVDN2J.")
else:
    # make sure the target folder exists before the file is written into it
    os.makedirs(pdf_dir, exist_ok=True)
    # only the first PDF attachment is downloaded
    zot.dump(pdf_items.tolist()[0], "@bleiLatentDirichletAllocation2003.pdf", pdf_dir)

In [79]:
# Display the selected PDF attachment(s).
# NOTE(review): the stored output below shows complete item rows and carries
# an older execution count than the cell defining pdf_items as a column of
# keys - it looks stale; re-run to refresh.
pdf_items

Unnamed: 0,key,version,library.type,library.id,library.name,library.links.self.href,library.links.self.type,library.links.alternate.href,library.links.alternate.type,links.self.href,...,data.patentNumber,data.filingDate,data.applicationNumber,data.issueDate,data.seriesTitle,data.path,data.meetingName,data.citekey,data.year,data.zotero_link
3597,4E4UR6QC,26276,user,6845452,Meine Bibliothek,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/4E4UR6QC


In [74]:
# Try to fetch file content for "E2HVDN2J", the key used as parent item when
# selecting PDF attachments. The call fails (see the error below): the server
# answers "Not a file attachment", so an attachment key has to be passed
# instead of the parent key.
zot.file("E2HVDN2J")

UnsupportedParamsError: 
Code: 400
URL: http://localhost:23119/api/users/6845452/items/E2HVDN2J/file?locale=en-US
Method: GET
Response: Not a file attachment: E2HVDN2J