In [197]:
from typing import Optional, Any
import json
import arxiv
import shutil
import requests
import numpy as np
import pandas as pd
from pypdf import PdfReader
from pathlib import Path
from crossref.restful import Works

import logging
# Raise the threshold of the "pypdf" logger to ERROR so that records below
# that level are not emitted while PDFs are being parsed.
logger = logging.getLogger("pypdf")
logger.setLevel(logging.ERROR)


In [152]:
# Library layout: everything lives under ~/metacognition/resources.
root = Path.home().joinpath("metacognition", "resources")
# Directory that the notebook scans recursively for paper PDFs.
papers = root.joinpath("papers")
# Directory that the notebook scans recursively for book PDFs.
books = root.joinpath("books")

In [None]:
# Module-level arXiv API client; get_metadata() in this cell reads it as a global.
client = arxiv.Client()

def get_metadata(filename: str):
    """Look up the arXiv record for a PDF named ``<prefix>_<arxiv-id>.pdf``.

    The arXiv identifier is whatever follows the last underscore in
    ``filename``, with a trailing ``.pdf`` removed. It is sent as an
    ``id_list`` search through the module-level ``client``
    (the equivalent REST call would be
    ``https://export.arxiv.org/api/query?id_list=<id>&max_results=1``).

    Returns:
        The first result yielded by the client, or ``None`` if anything at
        all goes wrong (bad filename, network error, no result).
    """
    try:
        arxiv_id = filename.split('_')[-1].removesuffix('.pdf')
        search = arxiv.Search(id_list=[arxiv_id])
        results = client.results(search)
        return next(results)
    except Exception:
        # Best effort: callers drop rows whose metadata is missing.
        return None

# Build the arXiv slice of the library: one row per PDF whose file stem,
# read as a DOI ('_' -> '/', lower-cased), starts with the "10.4855" prefix.
# Rows with any missing value (e.g. a failed arXiv lookup) are dropped before
# title and authors are pulled from the returned record.
df = (
    pd.DataFrame({"path": papers.rglob("*.pdf")})
    .assign(
        filename=lambda frame: frame["path"].apply(lambda pdf: pdf.name),
        doi=lambda frame: frame["path"].apply(lambda pdf: pdf.stem.replace('_', '/').lower()),
    )
    .loc[lambda frame: frame["doi"].str.startswith("10.4855")]
    .assign(metadata=lambda frame: frame["filename"].apply(get_metadata))
    .dropna()
    .assign(
        title=lambda frame: frame["metadata"].apply(lambda paper: paper.title),
        authors=lambda frame: frame["metadata"].apply(
            lambda paper: ', '.join(str(person) for person in paper.authors)
        ),
    )
)
df

In [242]:
# Merge the existing markdown index with the rows held in `df` and write the
# combined table, plus an empty "tags" column, to papers_index.md.
# The pipe-separated file is read without a header: its first three lines are
# skipped (presumably table header lines - verify against the file) and
# columns 1-3 are kept, since column 0 is the empty field before the leading '|'.
index_df = (
    pd.concat(
        [
            pd.read_table(papers / "index.md", sep="|", skiprows=3, usecols=[1, 2, 3], header=None)
            .set_axis(["filename", "title", "authors"], axis=1)
            # Stored names carry padding and no extension: add ".pdf", strip spaces.
            .assign(
                filename=lambda table: table["filename"].apply(
                    lambda stem: (stem + ".pdf").replace(' ', '')
                )
            ),
            df[["filename", "title", "authors"]],
        ],
        axis=0,
    )
    .assign(tags=None)
)
index_df.to_markdown(root / "papers_index.md", index=False)

In [196]:
# Dump the "name" and "metadata" columns of the current `df` to doi.csv,
# keeping the row index. The original passed `index = T`, an undefined name
# that raises NameError; `True` is the intended value.
# NOTE(review): `df` must be a frame that has a "name" column (the books
# pipeline builds one) - confirm which cell is expected to have run before this.
df[["name", "metadata"]].to_csv("doi.csv", index=True)

In [None]:
# Module-level Crossref client; crossref_search() in this cell reads it as a global.
works = Works()

def crossref_search(doi: str) -> Optional[dict[str, Any]]:
    """Fetch the Crossref record for ``doi`` through the module-level ``works``.

    Returns:
        Whatever ``works.doi(doi)`` returns, or ``None`` if the call raises
        any exception (the lookup is deliberately best effort).
    """
    try:
        record = works.doi(doi)
    except Exception:
        return None
    return record

def get_title(crossref_doi: dict) -> Optional[str]:
    """Return the first entry of a Crossref record's ``"title"`` list.

    Args:
        crossref_doi: Record dict as returned by the Crossref lookup.

    Returns:
        The first title, or ``None`` when the record has no usable title:
        the ``"title"`` key is absent, its list is empty, or its value is not
        subscriptable. The original only handled the empty-list case and
        raised ``KeyError`` for records without a ``"title"`` key.
    """
    try:
        return crossref_doi["title"][0]
    except (IndexError, KeyError, TypeError):
        return None

def get_authors(crossref_doi: dict) -> Optional[str]:
    """Format a Crossref record's authors as ``"Given Family, Given Family, ..."``.

    Authors whose ``"sequence"`` is ``"first"`` are listed before all others;
    within each group the record's order is kept.

    Args:
        crossref_doi: Record dict as returned by the Crossref lookup.

    Returns:
        The comma-separated author string, or ``None`` if the record has no
        ``"author"`` key or any author entry lacks ``"given"``, ``"family"``
        or ``"sequence"``.
    """
    first_author = []
    authors = []
    try:
        for author in crossref_doi["author"]:
            # Single quotes inside the f-string: reusing the outer quote
            # character is only legal from Python 3.12 onwards.
            name = f"{author['given']} {author['family']}"
            if author["sequence"] == "first":
                first_author.append(name)
            else:
                authors.append(name)
    except KeyError:
        return None
    return ', '.join(first_author + authors)

# Non-arXiv papers: derive a DOI from each file stem ('_' -> '/', lower-cased),
# discard the ones that start with the "10.4855" prefix, and query Crossref
# for each remaining DOI.
df = (
    pd.DataFrame({"path": papers.rglob("*.pdf")})
    .assign(doi=lambda frame: frame["path"].apply(lambda pdf: pdf.stem.replace('_', '/').lower()))
    .loc[lambda frame: frame["doi"].apply(lambda doi: not doi.startswith("10.4855"))]
    .assign(crossref=lambda frame: frame["doi"].apply(crossref_search))
)

# Keep the raw lookup results on disk.
# NOTE(review): the books cell also writes "library.csv" with different
# columns, and the rename cell reads it back - confirm the intended run order.
df[["doi", "crossref"]].to_csv("library.csv")

# Write the filename/title/authors table for every paper that Crossref
# resolved and that has both a title and a complete author list.
(
    df
    .dropna()
    .assign(
        title=lambda frame: frame["crossref"].apply(get_title),
        authors=lambda frame: frame["crossref"].apply(get_authors),
        filename=lambda frame: frame["doi"].str.replace('/', '_'),
    )
    .dropna()
    [["filename", "title", "authors"]]
    .to_markdown(root/'index.md', index=False, tablefmt="github")
)

In [189]:
def get_title(crossref_doi: dict) -> Optional[str]:
    """Return the first entry of a Crossref record's ``"title"`` list.

    This name is also defined in an earlier cell of this notebook; whichever
    definition ran last is the one in effect.

    Args:
        crossref_doi: Record dict as returned by the Crossref lookup.

    Returns:
        The first title, or ``None`` when the record has no usable title:
        the ``"title"`` key is absent, its list is empty, or its value is not
        subscriptable. The original only handled the empty-list case and
        raised ``KeyError`` for records without a ``"title"`` key.
    """
    try:
        return crossref_doi["title"][0]
    except (IndexError, KeyError, TypeError):
        return None

def get_authors(crossref_doi: dict) -> Optional[str]:
    """Format a Crossref record's authors as ``"Given Family, Given Family, ..."``.

    This name is also defined in an earlier cell of this notebook; whichever
    definition ran last is the one in effect.

    Authors whose ``"sequence"`` is ``"first"`` are listed before all others;
    within each group the record's order is kept.

    Args:
        crossref_doi: Record dict as returned by the Crossref lookup.

    Returns:
        The comma-separated author string, or ``None`` if the record has no
        ``"author"`` key or any author entry lacks ``"given"``, ``"family"``
        or ``"sequence"``.
    """
    first_author = []
    authors = []
    try:
        for author in crossref_doi["author"]:
            # Single quotes inside the f-string: reusing the outer quote
            # character is only legal from Python 3.12 onwards.
            name = f"{author['given']} {author['family']}"
            if author["sequence"] == "first":
                first_author.append(name)
            else:
                authors.append(name)
    except KeyError:
        return None
    return ', '.join(first_author + authors)

# Export filename/title/authors for every row of `df` that has a Crossref
# record, a title and a complete author list, as a GitHub-flavoured markdown
# table at papers/index.md.
(
    df
    .dropna()
    .assign(
        title=lambda frame: frame["crossref"].apply(get_title),
        authors=lambda frame: frame["crossref"].apply(get_authors),
        filename=lambda frame: frame["doi"].str.replace('/', '_'),
    )
    .dropna()
    [["filename", "title", "authors"]]
    .to_markdown(papers/'index.md', index=False, tablefmt="github")
)

In [126]:
def get_title_and_author(pdf: Path):
    """Read the title and author fields from a PDF's document metadata.

    Returns:
        A ``(title, author)`` tuple taken from ``PdfReader(pdf).metadata``,
        or ``np.nan`` if opening the file or reading either field raises.
    """
    try:
        info = PdfReader(pdf).metadata
        title, author = info.title, info.author
    except Exception:
        # Unreadable file or missing metadata object: signal with NaN.
        return np.nan
    return title, author

def request_isbn(title, author):
    """Look up a book on the Google Books API and return one of its identifiers.

    The first search hit for ``intitle:<title> inauthor:<author>`` is followed
    via its ``selfLink`` and the last entry of the volume's
    ``industryIdentifiers`` is returned.

    Changes from the original:
      * the query is passed through ``params`` so it is URL-encoded and the
        two search terms are separated (they were previously fused into
        ``intitle:<title>inauthor:<author>``);
      * an empty ``items`` or ``industryIdentifiers`` list no longer raises
        ``IndexError``;
      * a volume without identifiers yields ``None`` instead of the whole
        ``volumeInfo`` dict, so the result is always an identifier or ``None``;
      * requests carry a timeout instead of waiting indefinitely.

    Args:
        title: Book title to search for.
        author: Author name to search for.

    Returns:
        The identifier string, or ``None`` if either request does not return
        HTTP 200 or the expected fields are missing.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    search_ret = requests.get(
        "https://www.googleapis.com/books/v1/volumes",
        params={"q": f"intitle:{title} inauthor:{author}"},
        timeout=30,
    )
    if search_ret.status_code != 200:
        return None
    try:
        book_url = json.loads(search_ret.content)["items"][0]["selfLink"]
    except (KeyError, IndexError):
        # No hits: "items" is absent or empty.
        return None

    book_ret = requests.get(book_url, timeout=30)
    if book_ret.status_code != 200:
        return None
    book_data = json.loads(book_ret.content)
    try:
        return book_data["volumeInfo"]["industryIdentifiers"][-1]["identifier"]
    except (KeyError, IndexError):
        return None

# Books: read title/author from each PDF's metadata, keep only the files where
# both are present, and look up an ISBN for each.
df = (
    pd.DataFrame({"name": books.rglob("*.pdf")})
    .assign(old_name = lambda df: df["name"].apply(lambda x: x.name))
    .assign(metadata = lambda df: df["name"].apply(get_title_and_author))
    # get_title_and_author returns np.nan on failure; `None not in np.nan`
    # raises TypeError, so non-tuple results are rejected explicitly first.
    # When every PDF is readable this selects exactly the rows the original did.
    .loc[lambda df: df["metadata"].apply(lambda x: isinstance(x, tuple) and None not in x)]
    .reset_index(drop = True)
    # NOTE(review): hard-coded row positions, presumably entries rejected by
    # hand; they only stay valid while the directory listing is unchanged.
    .drop([9, 10, 14, 26, 35, 41, 44], axis = 0)
    .reset_index(drop = True)
    .assign(isbn = lambda df: df.apply(lambda x: request_isbn(x["metadata"][0], x["metadata"][1]), axis = 1))
)
# Persist the old-name -> ISBN mapping.
df[["old_name", "isbn"]].to_csv("library.csv")

In [122]:
# Scratch values for trying the Google Books search by hand.
title = "The C Programming Language (Second Edition)"
author = "Brian W. Kernighan"
# The two search terms need a '+' (URL-encoded space) between them; without
# it the query reads "intitle:<title>inauthor:<author>" as one fused term.
search_url = f"https://www.googleapis.com/books/v1/volumes?q=intitle:{title}+inauthor:{author}"


In [135]:
# Rename every book PDF to "<isbn>.pdf" using the mapping saved in library.csv.
# ISBNs are read as strings: letting pandas infer the dtype turns them into
# numbers, which drops leading zeros and, when any value is missing, appends
# ".0" to the file name.
df = pd.read_csv("library.csv", index_col=0, dtype={"isbn": str})
for _, row in df.iterrows():
    isbn = row["isbn"]
    if pd.isna(isbn):
        # No ISBN found: leave the file alone instead of moving it to "nan.pdf".
        continue
    source = books / row["old_name"]
    target = books / f"{isbn}.pdf"
    if not source.exists() or target.exists():
        # Already renamed on a previous run, or two books share an identifier;
        # never overwrite an existing file.
        continue
    shutil.move(source, target)

In [67]:
# Rename each PDF listed in df["name"] after the DOI printed on its third page.
# NOTE(review): the offsets are tied to one page layout - the DOI is expected
# on the second extracted line after a 26-character prefix (the length of
# "Digital Object Identifier "), and at most 25 characters of it are kept.
# Confirm before running on other documents.
for _, row in df.iterrows():
    text = PdfReader(row["name"]).pages[2].extract_text()
    second_line = text.split('\n')[1]
    doi = second_line[26:].replace(' ', '')[:25].lower()
    # The name is built outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12.
    new_name = doi.replace('/', '_')
    shutil.move(row["name"], papers / f"{new_name}.pdf")

In [None]:
def get_subject(pdf: Path) -> str:
    """Return the raw ``subject`` metadata field of a PDF.

    Returns:
        ``PdfReader(pdf).metadata.subject_raw`` as is, except that an empty
        string and any failure while reading the file both yield ``np.nan``.
    """
    try:
        raw_subject = PdfReader(pdf).metadata.subject_raw
        if raw_subject == '':
            return np.nan
        return raw_subject
    except Exception:
        return np.nan
    
def get_metadata(pdf: Path):
    """Return the document-information object of a PDF, or ``np.nan`` on failure.

    This notebook defines another ``get_metadata`` in an earlier cell, which
    takes a filename and queries arXiv; running this cell replaces it.
    """
    try:
        info = PdfReader(pdf).metadata
    except Exception:
        return np.nan
    return info

def get_doi(subject: str) -> Optional[str]:
    """Extract a DOI from a PDF ``subject`` string.

    Three forms are tried in order:
      1. everything after the first ``"doi.org/"``;
      2. everything after the first ``"doi:"``;
      3. the last ``;``-separated field, if it looks like a DOI: ``"10."``
         followed by a registrant code of at least four characters made of
         digits (optionally dot-separated), a ``/`` and a non-empty suffix.

    Changes from the original: surrounding whitespace is stripped from the
    result; form 3 no longer relies on fixed character offsets, so registrant
    codes longer than four digits (e.g. ``10.48550``) are accepted and strings
    that merely have ``.`` and ``/`` in the right positions are not; the
    always-true ``doi is not None`` check is gone; ``None`` is returned
    explicitly.

    Args:
        subject: Raw subject text.

    Returns:
        The DOI string, or ``None`` if none of the forms match.
    """
    for marker in ("doi.org/", "doi:"):
        idx = subject.find(marker)
        if idx != -1:
            return subject[idx + len(marker):].strip()

    candidate = subject.split(';')[-1].strip()
    prefix, _, suffix = candidate.partition('/')
    registrant = prefix[3:]
    looks_like_doi = (
        prefix.startswith("10.")
        and len(registrant) >= 4
        and registrant.replace('.', '').isdigit()
        and suffix != ''
    )
    return candidate if looks_like_doi else None

def get_crossref_metadata(meta):
    """Return the first Crossref search hit for ``meta``.

    Underscores in ``meta`` are replaced by spaces and the result is sent as a
    free-text query through the module-level ``works`` client.

    The original wrapped the query in an f-string whose nested double quotes
    are only legal from Python 3.12; the wrapper added nothing, so the string
    is now passed directly.

    Returns:
        The first item yielded by the query, or ``np.nan`` if the query
        raises or yields nothing.
    """
    try:
        return next(iter(works.query(meta.replace('_', ' '))))
    except Exception:
        return np.nan

# Crossref client read as a global by get_crossref_metadata().
works = Works()

# Papers whose file name does not yet start with "10" (i.e. not DOI-named):
# attach the PDF's own metadata and the best Crossref match for the file name.
# Rows with failed lookups are kept; they carry NaN in the affected column.
papers_df = (
    pd.DataFrame({"path": papers.rglob("*.pdf")})
    .assign(old_name=lambda frame: frame["path"].apply(lambda pdf: pdf.name))
    .loc[lambda frame: frame["old_name"].apply(lambda fname: not fname.startswith("10"))]
    .assign(
        metadata=lambda frame: frame["path"].apply(get_metadata),
        crossref=lambda frame: frame["old_name"].apply(get_crossref_metadata),
    )
    .reset_index(drop=True)
)
display(papers_df)
papers_df.to_csv("papers.csv")

In [None]:
# Scratch check: rebuild a dict in key order while leaving out one key.
d = {"b": 100, "c": 10, "a": 55}
{key: value for key, value in sorted(d.items()) if key != "b"}

In [76]:
# Export old_name -> DOI/URL for every paper with both PDF metadata and a
# Crossref match, sorted by old_name. Each Crossref record is first rebuilt
# with its keys sorted and the "reference" entry left out.
(
    papers_df
    .dropna()
    .assign(
        crossref=lambda frame: frame["crossref"].apply(
            lambda record: {key: record[key] for key in sorted(record) if key != "reference"}
        ),
        doi=lambda frame: frame["crossref"].apply(lambda record: record["DOI"]),
        url=lambda frame: frame["crossref"].apply(lambda record: record["URL"]),
    )
    .sort_values("old_name")
    .reset_index(drop=True)
    [["old_name", "doi", "url"]]
    .to_csv("papers.csv")
)

In [None]:
def is_arxiv(name: str) -> bool:
    """Tell whether a file name starts with a four-digit group before its first dot.

    Only the text before the first ``.`` is inspected; for example
    ``"2301.00001.pdf"`` qualifies and ``"paper.pdf"`` does not.
    """
    prefix, _, _ = name.partition('.')
    return len(prefix) == 4 and prefix.isdigit()
    

# Select the PDFs whose file name passes is_arxiv(), i.e. files still named
# by a bare four-digit-prefixed identifier.
papers_df = (
    pd.DataFrame({"path": papers.rglob("*.pdf")})
    .assign(old_name=lambda frame: frame["path"].apply(lambda pdf: pdf.name))
    .loc[lambda frame: frame["old_name"].apply(is_arxiv)]
    .reset_index(drop=True)
)
papers_df

In [26]:
# Prefix every selected file with "10.48550_" in place, so its stem reads as a
# DOI once '_' is mapped back to '/'.
# The target name is built from a local variable: the original nested double
# quotes inside a double-quoted f-string, which needs Python 3.12.
# NOTE(review): re-running this cell against a stale `papers_df` fails, since
# the source files have already been moved.
for _, row in papers_df.iterrows():
    source = row["path"]
    shutil.move(source, source.with_name(f"10.48550_{source.name}"))