In [None]:
# default_exp core

# send_to_pb

> use `send_to_pb <TARGET>` from your terminal

In [None]:
#export

import pathlib
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from tqdm import tqdm
import fastcore.test
import fastcore.utils
import requests

from send_to_pb import utils as u

In [None]:
#export
def get_arxiv_pdf_url(url) -> (str, str):
    """Resolve an arXiv link to its PDF download URL and a file name.

    Accepts a link to an abstract page (``/abs/<id>``) or a direct PDF link
    (``/pdf/<id>``, optionally ending in ``.pdf``). PDF links are rewritten to
    the abstract page, which is then fetched and parsed for the page title and
    the PDF download link.

    Args:
        url: Link to an arXiv abstract page or PDF.

    Returns:
        A ``(url_pdf, file_name)`` tuple: the absolute URL of the PDF and
        ``"<page title>.pdf"`` with path separators replaced by ``_`` so the
        name can be used as a single path component.

    Raises:
        Exception: If the URL path starts with neither ``/abs/`` nor ``/pdf/``.
        ValueError: If the fetched page has no title or no PDF download link.
    """
    url = url.strip('/')
    if urlparse(url).path.startswith("/pdf/"):
        u.logger.debug("Direct pdf link to ARXIV detected, replatincg to /abs/")
        # Rewrite only the leading path segment, not later occurrences.
        url = url.replace("/pdf/", "/abs/", 1)
        # Direct links are often written as /pdf/<id>.pdf; the abstract page
        # is addressed by the bare identifier.
        if url.endswith(".pdf"):
            url = url[:-len(".pdf")]

    if not urlparse(url).path.startswith("/abs/"):
        u.logger.error(f"This line should never run, check your url: {url}")
        raise Exception("This line should never run, check your url")

    u.logger.debug(f"Getting ARXIV link to parse {url}")
    content = fastcore.utils.urlread(url)
    u.logger.debug("OK")
    soup = BeautifulSoup(content, 'html.parser')
    u.logger.debug(f"Parsing {url}")

    title_tag = soup.find('title')
    if title_tag is None or not title_tag.contents:
        raise ValueError(f"No page title found at {url}")
    title = title_tag.contents[0]
    u.logger.debug(f"Title found: {title}")

    pdf_link = soup.find("a", class_="abs-button download-pdf")
    href = pdf_link.get('href') if pdf_link is not None else None
    if not href:
        raise ValueError(f"No pdf download link found at {url}")
    u.logger.debug(f"href to pdf found: {href}")

    # urljoin resolves relative hrefs against the page URL and leaves
    # absolute hrefs untouched.
    url_pdf = urljoin(url, href)
    # A title containing a path separator would otherwise point into a
    # sub-directory that does not exist.
    file_name = f"{title}.pdf".replace("/", "_").replace("\\", "_")
    return url_pdf, file_name

In [None]:
#export
def download_file(url:str, dest:pathlib.Path):
    """Stream `url` into the local file `dest`, showing a progress bar.

    The payload is first written to a sibling ``<name>.part`` file and only
    moved onto `dest` once the whole body has been received, so an interrupted
    download never leaves a truncated file at `dest`.

    Args:
        url: Address to download; redirects are followed.
        dest: Path of the file to create (replaced if it already exists).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    u.logger.debug(f"downloading to {dest}")
    part_path = dest.with_name(f"{dest.name}.part")
    with requests.get(url, stream=True, allow_redirects=True) as r:
        # Fail early instead of saving an error page as if it were the file.
        r.raise_for_status()
        # The header is missing for e.g. chunked responses; tqdm accepts
        # total=None and then shows a bar without a known end.
        content_length = r.headers.get('content-length')
        total_size = int(content_length) if content_length is not None else None
        try:
            with open(part_path, 'wb') as f, tqdm(total=total_size,
                                                  unit='iB',
                                                  unit_scale=True,
                                                  desc=dest.name,
                                                  initial=0,
                                                  ) as pbar:
                for ch in r.iter_content(chunk_size=1024):
                    if ch:
                        f.write(ch)
                        pbar.update(len(ch))
        except BaseException:
            # Also covers Ctrl-C: do not leave the partial file behind.
            if part_path.exists():
                part_path.unlink()
            raise
    part_path.replace(dest)

In [None]:
#export
def is_url(s):
    """Tell whether `s` looks like a network URL.

    A string counts as a URL only when it parses with both a scheme and a
    network location (``scheme://host...``). Requiring the host keeps local
    paths that merely contain a colon, such as ``C:/Users/me/paper.pdf`` or
    ``notes:draft.pdf``, from being mistaken for URLs.

    Args:
        s: The string to inspect.

    Returns:
        True if `s` has a scheme and a network location, False otherwise.
    """
    result = urlparse(s)
    return bool(result.scheme and result.netloc)

In [None]:
#export

from enum import Enum

class TargetType(Enum):
    """Categories that `Target.get_type` sorts an input string into."""
    # Path of an existing local file.
    LOCAL_FILE = 'LOCAL_FILE'
    # Path of an existing local directory.
    LOCAL_DIR = 'LOCAL_DIR'
    # URL pointing at arxiv.org.
    URL_ARXIV = 'URL_ARXIV'
    # Any other URL.
    URL_OTHER = 'URL_OTHER'
    # Not a URL and not an existing local file or directory.
    NOT_FOUND = 'NOT_FOUND'


class Target:
    """Something the user asked to send: a local path or a URL.

    Attributes:
        input: The raw string the target was created from.
        type: The `TargetType` that `get_type` assigned to `input`.
        file_path: Local path of the file to send; `None` until
            `fetch_target` has completed.
    """
    # Hosts whose links are handled as arXiv papers.
    ARXIV_HOSTS = ('arxiv.org', 'www.arxiv.org')
    # Where downloads are stored unless `fetch_target` is given a directory.
    DEFAULT_DOWNLOAD_DIR = "~/Downloads/send_to_pb"

    def __init__(self, input_str:str):
        self.input = input_str
        self.file_path = None
        self.type = self.get_type(input_str)

    def get_type(self, s:str) -> TargetType:
        """Classify `s` as an arXiv URL, another URL, a local file or directory.

        Local paths are checked against the file system; a string that is
        neither a URL nor an existing path yields `TargetType.NOT_FOUND` and a
        warning in the log.
        """
        if is_url(s):
            # `hostname` is lower-cased and carries no port or credentials,
            # unlike the raw `netloc`.
            host = urlparse(s).hostname or ''
            if host in self.ARXIV_HOSTS:
                return TargetType.URL_ARXIV
            return TargetType.URL_OTHER

        path = pathlib.Path(s)
        if path.is_file():
            return TargetType.LOCAL_FILE
        if path.is_dir():
            return TargetType.LOCAL_DIR
        u.logger.warning(f"Can't find {s}")
        return TargetType.NOT_FOUND

    def fetch_target(self, dest_dir=None):
        """Make the target available as a local file and return its path.

        arXiv URLs are downloaded (unless a file of the same name is already
        present); local files are used as they are.

        Args:
            dest_dir: Directory for downloaded files. Defaults to
                `DEFAULT_DOWNLOAD_DIR`; ``~`` is expanded.

        Returns:
            The path of the local file, also stored in `file_path`.

        Raises:
            NotImplementedError: For target types other than `URL_ARXIV` and
                `LOCAL_FILE`.
        """
        if self.type == TargetType.URL_ARXIV:
            url_pdf, title = get_arxiv_pdf_url(self.input)

            dest_dir = pathlib.Path(dest_dir or self.DEFAULT_DOWNLOAD_DIR).expanduser()
            if not dest_dir.exists():
                u.logger.info(f"Creating target directory: {dest_dir}")
                # parents=True: the parent (e.g. ~/Downloads) may be missing too.
                dest_dir.mkdir(parents=True, exist_ok=True)

            dest = dest_dir/title
            if dest.is_file():
                # file exists already
                u.logger.info(f"File already exists, skipping download {dest}")
            else:
                # file does not exist, let's download it
                download_file(url_pdf, dest)

            self.file_path = dest
            return self.file_path

        elif self.type == TargetType.LOCAL_FILE:
            self.file_path = pathlib.Path(self.input)
            u.logger.info(f"Will send a local file from {self.file_path}")
            return self.file_path

        else:
            raise NotImplementedError(f"{self.type.name} targets")

    __repr__ = fastcore.utils.basic_repr('input, type')
    

In [None]:
# Shared `Target` instance; a later cell checks `get_type` through `t.get_type`.
input_str = 'https://arxiv.org/abs/2010.05365'
t = Target(input_str)

In [None]:
def test_func(f, args, want):
    """Check that calling `f` on `args` gives `want`.

    A list or tuple is unpacked into positional arguments; any other value is
    passed to `f` as its single argument.
    """
    positional = args if isinstance(args, (list, tuple)) else (args,)
    fastcore.test.test_eq(f(*positional), want)

In [None]:
# Free text and relative paths are not URLs; http(s) links are.
for candidate, expected in [
    ("actually not a url", False),
    ("test_folder", False),
    ("test_folder/test_file.txt", False),
    ("http://google.com", True),
    ("https://arxiv.org/abs/2010.05365", True),
]:
    test_func(is_url, candidate, expected)

In [None]:
# `get_type` looks local paths up on disk, so create the fixtures these checks
# rely on instead of assuming the working directory already contains them.
fixture_dir = pathlib.Path("test_folder")
fixture_dir.mkdir(exist_ok=True)
(fixture_dir/"test_file.txt").touch()

test_func(t.get_type, "http://google.com", TargetType.URL_OTHER)
test_func(t.get_type, "https://arxiv.org/abs/2010.05365", TargetType.URL_ARXIV)
test_func(t.get_type, "test_folder", TargetType.LOCAL_DIR)
test_func(t.get_type, "test_folder/test_file.txt", TargetType.LOCAL_FILE)
test_func(t.get_type, "test_folder/this_file_does_not_exist.txt", TargetType.NOT_FOUND)
test_func(t.get_type, "not a file at all", TargetType.NOT_FOUND)



AssertionError: ==:
TargetType.NOT_FOUND
TargetType.LOCAL_DIR

In [None]:
# integration
# Actually test a download: resolve the arXiv abstract link and fetch the PDF.
# Needs network access; the file is written under ~/Downloads/send_to_pb.

input_str = 'https://arxiv.org/abs/2010.05365'
t = Target(input_str)
fetched_path = t.fetch_target()
fetched_path

2020-10-16 15:38:58.954 | DEBUG    | __main__:get_arxiv_pdf_url:9 - Getting ARXIV link to parse https://arxiv.org/abs/2010.05365
2020-10-16 15:39:00.398 | DEBUG    | __main__:get_arxiv_pdf_url:11 - OK
2020-10-16 15:39:00.449 | DEBUG    | __main__:get_arxiv_pdf_url:13 - Parsing https://arxiv.org/abs/2010.05365
2020-10-16 15:39:00.449 | DEBUG    | __main__:get_arxiv_pdf_url:16 - Title found: [2010.05365] ArXiving Before Submission Helps Everyone
2020-10-16 15:39:00.450 | DEBUG    | __main__:get_arxiv_pdf_url:19 - href to pdf found: /pdf/2010.05365
2020-10-16 15:39:00.458 | DEBUG    | __main__:download_file:3 - downloading to /home/david/Downloads/send_to_pb/[2010.05365] ArXiving Before Submission Helps Everyone.pdf
[2010.05365] ArXiving Before Submission Helps Everyone.pdf: 100%|██████████| 266k/266k [00:02<00:00, 107kiB/s]  


Path('/home/david/Downloads/send_to_pb/[2010.05365] ArXiving Before Submission Helps Everyone.pdf')

In [None]:
# integration
# Actually test a download, this time starting from a direct /pdf/ link
# (it is rewritten to the /abs/ page before parsing). Needs network access.

input_str = 'https://arxiv.org/pdf/2010.05365'
Target(input_str).fetch_target()


2020-10-16 15:39:04.468 | DEBUG    | __main__:get_arxiv_pdf_url:5 - Direct pdf link to ARXIV detected, replatincg to /abs/
2020-10-16 15:39:04.469 | DEBUG    | __main__:get_arxiv_pdf_url:9 - Getting ARXIV link to parse https://arxiv.org/abs/2010.05365
2020-10-16 15:39:09.642 | DEBUG    | __main__:get_arxiv_pdf_url:11 - OK
2020-10-16 15:39:09.689 | DEBUG    | __main__:get_arxiv_pdf_url:13 - Parsing https://arxiv.org/abs/2010.05365
2020-10-16 15:39:09.689 | DEBUG    | __main__:get_arxiv_pdf_url:16 - Title found: [2010.05365] ArXiving Before Submission Helps Everyone
2020-10-16 15:39:09.690 | DEBUG    | __main__:get_arxiv_pdf_url:19 - href to pdf found: /pdf/2010.05365
2020-10-16 15:39:09.691 | DEBUG    | __main__:download_file:3 - downloading to /home/david/Downloads/send_to_pb/[2010.05365] ArXiving Before Submission Helps Everyone.pdf
[2010.05365] ArXiving Before Submission Helps Everyone.pdf: 100%|██████████| 266k/266k [00:03<00:00, 75.8kiB/s] 


Path('/home/david/Downloads/send_to_pb/[2010.05365] ArXiving Before Submission Helps Everyone.pdf')