In [None]:
# default_exp core

# core

> Classify an input target (local path or URL), resolve arXiv links to their PDF, and download the file locally.

In [None]:
#export

from urllib.parse import urlparse 
import pathlib

from bs4 import BeautifulSoup
from tqdm import tqdm
import fastcore.test
import fastcore.utils
import requests

from send_to_pb import utils as u

In [None]:
#export
def get_arxiv_pdf_url(url) -> (str, str):
    """Resolve an arXiv link to the URL of its PDF and a local file name.

    Args:
        url: An arXiv link whose path starts with ``/pdf/`` (direct PDF link)
            or ``/abs/`` (abstract page). Surrounding ``/`` are stripped.

    Returns:
        A ``(pdf_url, file_name)`` tuple. For ``/pdf/`` links the URL is
        returned as given and the file name is the last path segment with a
        ``.pdf`` suffix. For ``/abs/`` links the abstract page is fetched and
        parsed: the file name is the page title plus ``.pdf`` and the URL is
        taken from the page's PDF download button.

    Raises:
        ValueError: The abstract page has no ``<title>`` or no PDF link.
        Exception: The URL path is neither ``/pdf/...`` nor ``/abs/...``.
    """
    url = url.strip('/')
    parsed = urlparse(url)

    if parsed.path.startswith("/pdf/"):
        # Use the parsed path (not the raw URL) so that a query string or
        # fragment on the link never ends up inside the file name.
        title = parsed.path.rstrip('/').split('/')[-1]
        if not title.endswith('.pdf'):
            title += '.pdf'
        return url, title

    if parsed.path.startswith("/abs/"):
        from urllib.parse import urljoin

        u.logger.debug(f"Getting ARXIV link to parse {url}")
        content = fastcore.utils.urlread(url)
        u.logger.debug("OK")
        soup = BeautifulSoup(content, 'html.parser')
        u.logger.debug(f"Parsing {url}")

        title_tag = soup.find('title')
        if title_tag is None or not title_tag.contents:
            raise ValueError(f"No <title> found on arXiv page: {url}")
        title = str(title_tag.contents[0]).strip()
        u.logger.debug(f"Title found: {title}")

        pdf_link = soup.find("a", class_="abs-button download-pdf")
        href = pdf_link.get('href') if pdf_link is not None else None
        if not href:
            raise ValueError(f"No PDF download link found on arXiv page: {url}")
        u.logger.debug(f"href to pdf found: {href}")

        # urljoin copes with both site-relative ("/pdf/...") and absolute hrefs.
        url_pdf = urljoin(url, href)

        # The title becomes a file name, so it must not contain path
        # separators (page titles embed the paper id, and old-style ids such
        # as "hep-th/9901001" include a slash).
        file_stem = title.replace('/', '_').replace('\\', '_')
        return url_pdf, f"{file_stem}.pdf"

    u.logger.error(f"This line should never run, check your url: {url}")
    raise Exception("This line should never run, check your url")

In [None]:
#export
def download_file(url:str, dest:pathlib.Path):
    """Stream the resource at `url` into the file `dest` with a progress bar.

    Args:
        url: Location to download; redirects are followed.
        dest: Path of the file to write (opened in binary write mode, so an
            existing file is overwritten). Its name labels the progress bar.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
    """
    u.logger.debug(f"downloading to {dest}")
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, allow_redirects=True) as r:
        # Fail before touching `dest`, so an error page is never saved as the file.
        r.raise_for_status()

        # Content-Length is optional (e.g. chunked responses); with total=None
        # tqdm shows a plain byte counter instead of a percentage.
        content_length = r.headers.get('content-length')
        total_size = int(content_length) if content_length is not None else None

        with open(dest, 'wb') as f, tqdm(total=total_size,
                                         unit='iB',
                                         unit_scale=True,
                                         desc=dest.name,
                                         initial=0,
                                         ) as pbar:
            for ch in r.iter_content(chunk_size=1024):
                if ch:
                    f.write(ch)
                    pbar.update(len(ch))

In [None]:
#export
def is_url(s):
    """Return True when `s` parses as a URL with a scheme and a network location.

    Requiring the network location as well as the scheme keeps strings that
    merely contain a colon after a letter-only prefix (for example the Windows
    path ``C:\\Users\\me\\paper.pdf``) from being mistaken for URLs.
    """
    result = urlparse(s)
    return bool(result.scheme and result.netloc)

In [None]:
#export

from enum import Enum

class TargetType(Enum):
    """Kinds of input a `Target` can be classified as.

    Each member's value is its own name as a string.
    """
    # Paths on the local filesystem.
    LOCAL_FILE = 'LOCAL_FILE'
    LOCAL_DIR = 'LOCAL_DIR'

    # URLs: hosted on arxiv.org, or anywhere else.
    URL_ARXIV = 'URL_ARXIV'
    URL_OTHER = 'URL_OTHER'

    # Neither a URL nor an existing file or directory.
    NOT_FOUND = 'NOT_FOUND'


class Target:
    """A user-supplied target (URL or local path) and what kind of thing it is.

    Attributes:
        input: The raw string the target was created from.
        type: The `TargetType` that `get_type` assigned to `input`.
        fetched_path: Local path of the downloaded file; ``None`` until
            `fetch_target` has succeeded.
    """

    def __init__(self, input_str:str):
        """Classify `input_str` and reject target types that are not supported yet.

        Raises:
            NotImplementedError: The target is not an arXiv URL.
        """
        self.input = input_str
        self.type = self.get_type(input_str)
        # Defined up front so the attribute exists before any fetch happens.
        self.fetched_path = None

        self.validate()

    def get_type(self, s:str) -> TargetType:
        """Classify `s` as an arXiv URL, another URL, a local file or directory, or not found."""
        if is_url(s):
            # `hostname` is lower-cased and excludes any port, so
            # "ARXIV.ORG" and "arxiv.org:443" are recognised as well.
            host = urlparse(s).hostname
            if host in ('arxiv.org', 'www.arxiv.org'):
                return TargetType.URL_ARXIV
            return TargetType.URL_OTHER

        path = pathlib.Path(s)
        if path.is_file():
            return TargetType.LOCAL_FILE
        if path.is_dir():
            return TargetType.LOCAL_DIR
        return TargetType.NOT_FOUND

    def validate(self):
        """Raise NotImplementedError unless the target is an arXiv URL."""
        if self.type is not TargetType.URL_ARXIV:
            raise NotImplementedError(f"{self.type.name} targets")

    def fetch_target(self):
        """Download the target's PDF into ``~/Downloads/send_to_pb``.

        Returns:
            The path of the downloaded file (also stored in `fetched_path`).

        Raises:
            NotImplementedError: The target is not an arXiv URL.
        """
        if self.type != TargetType.URL_ARXIV:
            raise NotImplementedError(f"{self.type.name} targets")

        url_pdf, title = get_arxiv_pdf_url(self.input)

        dest_dir = pathlib.Path("~/Downloads/send_to_pb").expanduser()
        if not dest_dir.exists():
            u.logger.info(f"Creating target directory: {dest_dir}")
            # parents=True: ~/Downloads itself may not exist yet.
            dest_dir.mkdir(parents=True, exist_ok=True)

        dest = dest_dir/title
        download_file(url_pdf, dest)
        self.fetched_path = dest
        return dest

    __repr__ = fastcore.utils.basic_repr('input, type')
    

In [None]:
# slow
# End-to-end example: classify an arXiv abstract link, then download its PDF.
# This cell performs real network requests and writes the file to disk.
input_str = 'https://arxiv.org/abs/2010.05365'
input_str

# NOTE: `t` is reused by the `get_type` checks further down in the notebook.
t = Target(input_str)
fetched_path = t.fetch_target()
# Last expression of the cell: display where the PDF was saved.
fetched_path

2020-10-15 17:41:40.251 | DEBUG    | __main__:get_arxiv_pdf_url:9 - Getting ARXIV link to parse https://arxiv.org/abs/2010.05365
2020-10-15 17:41:42.248 | DEBUG    | __main__:get_arxiv_pdf_url:11 - OK
2020-10-15 17:41:42.301 | DEBUG    | __main__:get_arxiv_pdf_url:13 - Parsing https://arxiv.org/abs/2010.05365
2020-10-15 17:41:42.301 | DEBUG    | __main__:get_arxiv_pdf_url:16 - Title found: [2010.05365] ArXiving Before Submission Helps Everyone
2020-10-15 17:41:42.302 | DEBUG    | __main__:get_arxiv_pdf_url:19 - href to pdf found: /pdf/2010.05365
2020-10-15 17:41:42.303 | DEBUG    | __main__:download_file:2 - downloading to /home/david/Downloads/send_to_pb/[2010.05365] ArXiving Before Submission Helps Everyone.pdf
[2010.05365] ArXiving Before Submission Helps Everyone.pdf: 100%|██████████| 266k/266k [00:03<00:00, 87.6kiB/s] 


In [None]:
def test_func(f, args, want):
    """Check with `fastcore.test.test_eq` that calling `f` yields `want`.

    A list or tuple passed as `args` is unpacked into positional arguments;
    any other value is handed to `f` as its single argument.
    """
    positional = args if isinstance(args, (list, tuple)) else (args,)
    fastcore.test.test_eq(f(*positional), want)

In [None]:
# Plain text and relative paths are not URLs; http(s) links are.
is_url_cases = [
    ("actually not a url", False),
    ("test_folder", False),
    ("test_folder/test_file.txt", False),
    ("http://google.com", True),
    ("https://arxiv.org/abs/2010.05365", True),
]
for candidate, expected in is_url_cases:
    test_func(is_url, candidate, expected)

In [None]:
# Build a dedicated Target here instead of reusing `t` from the `# slow` cell:
# constructing one only classifies the string (no network access), so these
# checks still run when that cell has not been executed.
type_probe = Target("https://arxiv.org/abs/2010.05365")

# NOTE(review): the local-path cases assume a `test_folder/test_file.txt`
# fixture exists relative to the working directory - confirm it is checked in.
get_type_cases = [
    ("http://google.com", TargetType.URL_OTHER),
    ("https://arxiv.org/abs/2010.05365", TargetType.URL_ARXIV),
    ("test_folder", TargetType.LOCAL_DIR),
    ("test_folder/test_file.txt", TargetType.LOCAL_FILE),
    ("test_folder/this_file_does_not_exist.txt", TargetType.NOT_FOUND),
    ("not a file at all", TargetType.NOT_FOUND),
]
for candidate, expected in get_type_cases:
    test_func(type_probe.get_type, candidate, expected)