In [None]:
import pandas as pd
from pyzotero.zotero import Zotero
from dotenv import load_dotenv
import os
import sys

# Make the parent directory importable so the notebook can load the `src` package.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


from src.utils import *

# Load environment variables from the .env file into the process environment.
load_dotenv()

# ID of the Zotero library to read.
ZOTERO_ID = os.getenv("ZOTERO_ID")
# Base folder that is joined with REFERENCES to locate the markdown notes and PDFs.
VAULT_PATH = os.getenv("VAULT_PATH")
# Subfolder of VAULT_PATH that holds the notes. The environment value takes
# precedence; "3. Literature" is only the fallback when the variable is unset or
# empty (it used to overwrite the configured value unconditionally).
REFERENCES = os.getenv("REFERENCES") or "3. Literature"

# NOTE(review): pyzotero's third positional parameter is `api_key`, so this string
# is passed as the key rather than naming the library — confirm whether it can be
# dropped now that local=True is used.
zot = Zotero(ZOTERO_ID, "user", "Meine Biblliothek", local=True)

In [2]:
# Retrieve all items of the library: zot.items() issues the initial request and
# zot.everything() wraps it so that the complete result set is returned.
items = zot.everything(zot.items())

In [3]:
# Flatten the nested item dicts into one row per item; nested keys become
# dot-separated column names (e.g. "data.date").
# json_normalize accepts the list of dicts directly, so the former
# DataFrame -> to_dict(orient="records") round trip is not needed.
df_items = pd.json_normalize(items)

In [4]:
# Write the items to a parquet file in the project's data folder.
# The target is derived from module_path instead of a machine-specific absolute
# path with mixed separators, so the notebook also runs on other machines.
# NOTE(review): assumes module_path (the parent of the notebook folder) is the
# project root that contains "data" — confirm.
items_parquet_path = os.path.join(module_path, "data", "zotero_items.parquet")
# Create the folder on first use so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(items_parquet_path), exist_ok=True)
df_items.to_parquet(
    items_parquet_path,
    index=False,
    engine="pyarrow",
)

In [None]:
# Add the derived columns to the frame. Each entry maps the new column name to the
# row-wise function that produces its values; the columns are added in this order.
derived_columns = {
    "data.citekey": create_citekey,
    "data.year": create_year,
    # the zotero_link column
    "data.zotero_link": create_zotero_link,
}
for column_name, row_func in derived_columns.items():
    df_items[column_name] = df_items.apply(row_func, axis=1)

  return int(pd.to_datetime(row['data.date'], errors='coerce').year)


# Read Obsidian Data

In [6]:
# Collect the markdown notes located directly in VAULT_PATH/REFERENCES.
references_dir = os.path.join(VAULT_PATH, REFERENCES)
md_files = [name for name in os.listdir(references_dir) if name.endswith(".md")]
print(f"Found {len(md_files)} markdown files in {references_dir}")

# Collect the PDFs stored in the "pdfs" subfolder of the references folder.
pdf_dir = os.path.join(VAULT_PATH, REFERENCES, "pdfs")
pdf_files = [name for name in os.listdir(pdf_dir) if name.endswith(".pdf")]
print(f"Found {len(pdf_files)} pdf files in {pdf_dir}")

Found 74 markdown files in C:\Users\Admin\Insync\d.h.jaggi@gmail.com\Google Drive\projects\4. Work\Research\3. Literature
Found 52 pdf files in C:\Users\Admin\Insync\d.h.jaggi@gmail.com\Google Drive\projects\4. Work\Research\3. Literature\pdfs


In [7]:
# Show the frame, including the derived citekey / year / zotero_link columns,
# for a visual check (the rendered table is abbreviated for long frames).
df_items

Unnamed: 0,key,version,library.type,library.id,library.name,library.links.self.href,library.links.self.type,library.links.alternate.href,library.links.alternate.type,links.self.href,...,data.patentNumber,data.filingDate,data.applicationNumber,data.issueDate,data.seriesTitle,data.path,data.meetingName,data.citekey,data.year,data.zotero_link
0,LTACJM46,33885,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,@garelInvestorsCareBiodiversity2024,2024.0,zotero://select/items/LTACJM46
1,FIA3YIQV,33856,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/FIA3YIQV
2,BYJC8MGQ,33853,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/BYJC8MGQ
3,GYGPC7JX,33852,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,@dassTruncationBiasCorrections2017,2017.0,zotero://select/items/GYGPC7JX
4,ZWHAMDFW,33848,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,@amorosoMAsInnovationNew,,zotero://select/items/ZWHAMDFW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5245,65R24QLP,50,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/65R24QLP
5246,YY8SFYV6,48,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/YY8SFYV6
5247,GEJFKW4T,1046,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/GEJFKW4T
5248,LRZQG68T,1335,user,6845452,My Library,http://localhost:23119/api/users/6845452,application/json,https://www.zotero.org/users/6845452,text/html,http://localhost:23119/api/users/6845452/items...,...,,,,,,,,,,zotero://select/items/LRZQG68T


In [10]:
# Process the markdown notes, handing over the Zotero item table, the PDF listing,
# the vault location and the Zotero client.
# NOTE(review): process_markdown_files is not defined in this notebook; presumably
# it comes from the star import of src.utils — confirm what it modifies on disk.
process_markdown_files(md_files, df_items, pdf_files, VAULT_PATH, REFERENCES, zot)

Processing file: @abiyounesReplicablePatentIndicators2024.md
Processing file: @aboodAutomatedPatentLandscaping2018.md
Processing file: @araciFinBERTFinancialSentiment2019.md
Processing file: @artsNaturalLanguageProcessing2021.md
Processing file: @artsPositionDifferentiationFirms2023.md
Processing file: @artsTextMatchingMeasure2018.md
Processing file: @baucklohDoesItPay2022.md
Processing file: @bekamiriHybridModelPatent2021.md
Processing file: @bellstamTextBasedAnalysisCorporate2019.md
Processing file: @benaCorporateInnovationsMergers2014.md
Processing file: @benaPatentIntensityFirm2022.md
Processing file: @bhojrajWhatMyLine2003.md
Processing file: @binglerCheapTalkCherrypicking2022.md
Processing file: @bleiLatentDirichletAllocation2003.md
Processing file: @bronzettiDisclosureInnovationSustainability2021.md
Processing file: @chenLeveragingGoogleBERT2021.md
Processing file: @chinloyInvestmentDepreciationObsolescence2020.md
Processing file: @choiDeepLearningPatent2022.md
Processing file: 