metacurl

#!/usr/bin/env python3

import subprocess, os, hashlib
import tempfile
import urllib.parse
import re
import datetime, time
import dateutil.parser
import platform, sys
import argparse
import json
import sqlite3
import base64, binascii
import urllib.request, urllib.parse, urllib.error
import html.parser
import io
import codecs

###########################################################################
###########################################################################

# nonessential module
try:
    import xattr
except ImportError:
    pass # just ignore it for now

def set_extended_attr(fno,aname,avalue):
    """
    Best-effort: store ``avalue`` as extended attribute ``user.<aname>``.

    Args:
        fno: file descriptor (or whatever the xattr module accepts as target)
        aname (str): attribute name, stored under the ``user.`` prefix
        avalue (str): attribute value, written UTF-8 encoded

    Returns:
        None.  Every failure is ignored on purpose: the ``xattr`` module is
        optional (it may not even be imported) and extended attributes are
        not essential for the download.
    """
    full_name = 'user.%s' % (aname,)

    # pyxattr API should work with python3
    try:
        xattr.set(fno, full_name, avalue.encode('utf-8'))
        return
    except Exception:
        # includes NameError when the optional module is missing;
        # KeyboardInterrupt/SystemExit are deliberately NOT swallowed
        pass

    try:
        # XXX This is for Linux, OS X has a different module
        # and not with each installed python version - need to figure it out
        xattr._fsetxattr(fno, full_name, avalue.encode('utf-8'))
    except Exception:
        # print("fsetxattr %s failed"%(aname,))
        pass

# environment name for user's meta filelist directory
FL = 'FLMETADIR'
# presumably another environment variable name (not referenced in this part
# of the file) -- TODO confirm
METAFILE = 'METAFILE'
# environment variable naming the cookie file handed to curl (-b / -c)
METACOOKIES = 'METACOOKIES'

# metadata field names; MetaFLJSONEncoder emits dict members in this order
N_ALL = [
    'file',
    'size',
    'stamp',
    'md5',
    'sha1',
    'sha256',
    'githash',
    'mimetype',
    'charset',
    'encoding',
    'url',
    'stored',
    'server',
    'etag',
]

class MetaFLJSONEncoder(json.JSONEncoder):
    """
    JSON encoder with a fixed member layout for dicts.

    A dict is written with exactly the keys listed in N_ALL, in that order;
    keys missing from the dict are written as null and keys not listed in
    N_ALL are dropped.  Anything else is encoded by the stock JSONEncoder.
    """

    def encode(self, o):
        if not isinstance(o, dict):
            return super().encode(o)

        members = []
        for key in N_ALL:
            # both the key and the value go through encode() again, so a
            # nested dict gets the same fixed layout
            members.append("%s:%s" % (self.encode(key), self.encode(o.get(key))))
        return "{" + ",".join(members) + "}"

###########################################################################
###########################################################################

def tokensplit(s):
    """
    Split a header field value into its ';'-separated parts.

    Each part is returned either as ``token`` or as ``token=value``; a value
    given as a quoted string is returned without its quotes and may contain
    semicolons.  Optional whitespace (space, tab) around parts is dropped.

    Parts that do not start with an RFC 7230 token (for example the media
    type ``text/html``, since '/' is not a token character) are returned
    verbatim up to the next ';'.  Empty parts are skipped.

    Args:
        s (str): field value, e.g. ``'inline; filename="a;b.pdf"'``

    Returns:
        list of str, e.g. ``['inline', 'filename=a;b.pdf']``
    """
    ows = '\x20\x09'
    # leading OWS, token, OWS, separator (';', '=' or end), OWS, remainder
    token_re = re.compile(
        '^[\x20\x09]*([A-Za-z0-9\x21\x23\x24\x25\x26\x27\x2a\x2b\x2d\x2e\x5e\x5f\x60\x7c\x7e]+)[\x20\x09]*(;|=|$)[\x20\x09]*(.*?)$')

    rest = s.strip(ows)
    tokens = []
    while rest:
        m = token_re.match(rest)
        if m is None:
            # not a plain token: keep everything up to the next ';' as one
            # part and carry on with what follows it
            piece, _, rest = rest.partition(';')
            piece = piece.strip(ows)
            if piece:
                tokens.append(piece)
            continue

        name, sep, tail = m.group(1), m.group(2), m.group(3)
        if sep != '=':
            # end of token (or separated by ; from the next)
            tokens.append(name)
            rest = tail
            continue

        if tail.startswith('"'):
            closing = tail.find('"', 1)
            if closing < 0:
                # unterminated quoted string: take the rest as the value
                tokens.append("%s=%s" % (name, tail[1:]))
                break
            value = tail[1:closing]
            # drop the ';' that separates the quoted string from the next part
            rest = tail[closing + 1:].lstrip(ows)
            if rest.startswith(';'):
                rest = rest[1:]
        else:
            # an empty value ("name=") is legal here and yields "name="
            value, _, rest = tail.partition(';')
            value = value.rstrip(ows)
        tokens.append("%s=%s" % (name, value))

    return tokens

class http_headerblock(object):
    """
    HTTP header block, parsed from a pipe
    http_headerblock(pipe).  The status line and the header fields are read
    by iterating over the pipe, up to and including the empty line that ends
    the block; the body is left unread.

    Args:
        pipe (handle): an input file handle yielding bytes lines

    Attributes:
        http_ver (str): HTTP version token
        status_code (int): HTTP status code
        reason (str): HTTP reason phrase ('None given' when absent)
        lines (list): list of header fields (unfolded), or None when the
            pipe delivered nothing at all
        headers (dict): upper-cased field name -> list of field values

    Raises:
        ValueError: when the first line cannot be parsed as a status line
        UnicodeDecodeError: when a line is not valid UTF-8
    """

    def get_all_headers(self, name):
        """Get all headers matching the name, tokenized"""
        ## expects: values stripped of ows
        ## Returns:
        ## lines in a list
        ## containing token value pairs in a list
        ## [firsttoken, firstvalue or None, repeat of: token, value ]
        res = []
        for h in self.headers.get(name.upper(), []):
            # review rfc6266
            line = []
            for t in tokensplit(h):
                z = t.split('=', 1)
                z.append(None)  # pad so a bare token gets value None
                line.extend(z[:2])
            res.append(line)
        return res

    def get_single_header(self, name):
        """Get a simple header, fail (return None) if absent or multiple found"""
        allvalues = self.headers.get(name.upper())
        if allvalues is None or len(allvalues) != 1:
            return None
        return allvalues[0]

    def get_duplicated_header(self, name):
        """Get a single header, if there are multiple headers, return
        only if they all match"""
        allvalues = self.headers.get(name.upper())
        if not allvalues:
            return None
        cl = allvalues[0]
        for s in allvalues[1:]:
            if s != cl:
                sys.stderr.write("!! ambiguous header %s (%s:%s)\n" % (name, s, cl))
                return None
        return cl

    # content-length - may have several occurrences, but all those must match
    def get_content_length(self):
        """Return the content length as int, or None when it is missing,
        unparsable or given with conflicting values."""
        entries = self.get_all_headers('Content-Length') + \
            self.get_all_headers('X-Archive-Orig-Content-Length')
        try:
            sizes = {int(line[0]) for line in entries}
        except (ValueError, TypeError, IndexError):
            # IndexError: a header with an empty value tokenizes to []
            return None
        if len(sizes) != 1:
            return None
        return sizes.pop()

    def get_chrono_header(self, name):
        """Retrieve time-specific field value as seconds since the epoch
        (int), or None when the header is missing, ambiguous or unparsable"""
        # Thu, 05 Sep 2002 08:34:38 GMT
        # September, 08-Sun-02 13:51:43 GMT
        # Thursday, 19-Sep-02 13:41:10 GMT
        # November, 14-Thu-02 14:01:04 GMT
        # Monday, 25-Nov-102 12:49:26 GMT
        # Thu, 18 Dec 2003 01:32:51 GMT
        # Fri, 13 Apr 2018 19:07:57 GMT
        n = self.get_duplicated_header(name)
        if n is None:
            return None
        try:
            tzi = {"UTC": 0, "GMT": 0}
            d = dateutil.parser.parse(n, tzinfos=tzi)
            if d.tzinfo is None:
                # HTTP dates are GMT; do not let the local zone leak in
                d = d.replace(tzinfo=datetime.timezone.utc)
            # strftime("%s") is a glibc extension and ignores tzinfo
            return int(d.timestamp())
        except (ValueError, AttributeError, TypeError, OverflowError):
            return None

    def __str__(self):
        if self.http_ver:
            return "* [%s] %s %s %s\n" % (len(self.lines), self.http_ver,
                                          self.status_code, self.reason)
        else:
            return "?"

    def __init__(self, pipe):
        self.lines = None  #: list of header lines
        self.headers = {}
        self.http_ver = None  #: HTTP version token
        self.status_code = None  #: HTTP status code (3 digit)
        self.reason = None  #: HTTP reason phrase

        ows = "\x20\x09"

        # https://tools.ietf.org/html/rfc7230#section-3.1.2
        # status-line = HTTP-version SP status-code SP reason-phrase CRLF
        l = ''
        for l in pipe:
            if l: l = l.decode('UTF-8').rstrip("\x20\x09\r\n")
            break
        else:
            # nothing to read at all: leave self.lines as None
            return None

        try:
            (v, c, r) = l.split(None, 2)
        except ValueError:
            (v, c) = l.split(None, 1)
            r = 'None given'
        (self.http_ver, self.status_code, self.reason) = (v, int(c), r.strip())

        # https://tools.ietf.org/html/rfc7230#section-3.1.2
        # header-field   = field-name ":" OWS field-value OWS
        lines = []

        for l in pipe:
            if l: l = l.rstrip(b"\r\n").decode('UTF-8')
            if l == '':
                # empty line terminates the header block
                break
            # XXX folding not well tested
            if l[0] in ows:
                l = l.strip(ows)
                if len(lines) > 0:
                    # not perfect for traditional Set-Cookie (no testing available)
                    lines[-1] = "%s %s" % (lines[-1], l)
                else:
                    # replicate bogus field
                    lines.append(" " + l)
            else:
                lines.append(l.rstrip(ows))

        # now parse the lines to header fields
        h = {}
        for l in lines:
            if ':' not in l:
                # XXX really bad, but stay robust
                continue
            (f, v) = l.split(':', 1)
            # field-name shouldn't have any, but anyway
            f = f.strip(ows).upper()
            ## provides: values stripped of ows
            h.setdefault(f, []).append(v.strip(ows))

        self.lines = lines
        self.headers = h
        return


###########################################################################
###########################################################################


class singlefile(object):
    """
    Retrieve a single file by using curl and inspecting the output
    """

    def __init__(self, url, curlopts, debug=False, text=False, auth=None):
        """
        Download ``url`` by running curl and inspecting its output.

        curl's stdout is expected to start with the HTTP header block,
        followed by the body.  Redirects are followed here (not by curl), the
        body is written to a temporary file in the current directory while
        MD5/SHA1/SHA256 (and, when the size is known, the git blob hash) are
        computed.  For ftp URLs the header block is faked from SIZE/MDTM
        replies obtained with ftplib.

        Args:
            url (str): URL to fetch; a ``#md5=..``/``#sha1=..``/``#sha256=..``
                fragment supplies an expected digest
            curlopts (list): the curl command line (program name first); the
                URL and per-request options are appended to a copy of it
            debug (bool): plain line-by-line progress output, no ANSI codes
            text (bool): stored as ``allow_text`` (forced on for *.html URLs)
            auth: not used by this method

        Result (attributes): ``error`` is None on success, otherwise a
        message; ``tmpname`` names the temporary file; ``filename``,
        ``filesize``, ``hashes``, ``stamp``, ``stored``, ``server``, ``etag``,
        ``mimetype``, ``charset`` and ``encoding`` describe the download.
        """
        self.error = "not initialized"
        self.progress = "downloading"
        self.orig_url = ""
        self.filename = None
        self.filesize = None
        self.hashes = []  # md5,sha1,sha256,*githash
        self.server = None
        self.etag = None
        self.mimetype = None
        self.charset = None
        self.encoding = None
        self.tmpname = None
        self.expected_md5 = None
        self.expected_sha1 = None
        self.expected_sha256 = None
        self.refer = None
        self.ext_url = None
        self.ext_args = []
        self.allow_text = text
        # defined up front so they exist even when the download fails early
        self.stamp = None
        self.stored = None

        if url[-5:] == '.html':
            self.allow_text = True

        if debug:
            ANSI_OFF = ''
            ANSI_RED = ''
            ANSI_GREEN = ''
        else:
            ANSI_OFF = '\033[m\033[K'
            ANSI_RED = '\033[31m'
            ANSI_GREEN = '\033[32m'

        if '#' in url:
            ## TODO parse fragment for hash spec
            [url, frag] = url.split('#', 1)
        else:
            frag = ''

        # check if any fixing is needed
        url, frag = self.canonical_url(url, frag)

        m = re.search('(md5|sha1|sha256)[A-Za-z]*=([A-Za-z0-9+/=]+)', frag, re.I)
        if m:
            if m.group(1).upper() == 'MD5':
                self.expected_md5 = m.group(2).lower()
            elif m.group(1).upper() == 'SHA1':
                self.expected_sha1 = m.group(2).lower()
            elif m.group(1).upper() == 'SHA256':
                digest = m.group(2)
                if len(digest)<64:
                    # shorter than 64 hex digits: taken to be base64
                    try:
                        self.expected_sha256=binascii.hexlify(base64.b64decode(digest)).decode('ASCII').lower()
                    except ValueError:
                        sys.stderr.write("!! unparsable SHA256 digest in URL fragment\n")
                else:
                    self.expected_sha256=digest.lower()

        # XXX also - multihash qm

        # save original url and extract possible filename (without parameters)
        if not self.orig_url:
            self.orig_url = url
        self.ext_url = url

        parts = url.split('?', 1)[0].split('/')
        # [:8] is new XXX
        if parts[-1] == '' or parts[-1][0] in '&?#' or parts[-1][:8] == 'download':
            hunk = parts[-2] if len(parts) > 1 else ''
        else:
            hunk = parts[-1]

        # NOTE: this used to fail silently (UnboundLocalError hidden by a bare
        # except) because a function-local "import urllib.parse" further down
        # made "urllib" a local name for the whole method.
        unquoted = urllib.parse.unquote(hunk)
        if unquoted and '/' not in unquoted:
            # an encoded '/' must not turn the name into a path
            hunk = unquoted
        self.filename = hunk
        self.mimetype = None

        # initially, cursor is at the start of the line
        # progress is expected to keep the cursor mid-line (right after stats)
        while True:
            url, frag = self.canonical_url(url, frag)
            ###
            if self.orig_url != url and re.match(r'https?://t\.co/.*|.*kernel\.org/.*|.*downloads.sourceforge.net/.*',self.orig_url):
                print("forget",self.orig_url,url)
                self.orig_url = url

            sys.stderr.write(("-- %s %s %s\n"
                              if debug else "\r-- %s %s %s  \r") % (
                                  self.progress,
                                  url,
                                  ANSI_OFF, ))
            all_args = curlopts[:]
            all_args.extend(self.ext_args)
            if self.refer:
                all_args.extend([
                    '-H',
                    'Referer: %s' % (self.refer),
                ])

            if METACOOKIES in os.environ:
                # same file is used to send and to store cookies
                cookie=os.environ[METACOOKIES]
                all_args.append('-b')
                all_args.append(cookie)
                all_args.append('-c')
                all_args.append(cookie)

            ### ftp has metadata, but curl does not pass them - fake them here
            if url[:3] == 'ftp':
                ftpconv = ''

                import ftplib
                u = urllib.parse.urlparse(url)
                netloc = u.netloc
                auth_user = 'ftp'
                auth_pass = 'ftp@'
                if '@' in netloc:
                    (a,l) = netloc.rsplit('@',1)
                    netloc = l
                    if ':' in a:
                        (auth_user,auth_pass) = a.rsplit(':',1)
                    else:
                        auth_user = a
                # ParseResult(scheme='ftp', netloc='atrey.karlin.mff.cuni.cz:21', path='/pub/linux/pci/pciutils-3.5.6.tar.gz.sign', params='', query='', fragment='')
                all_args.append('-u')
                all_args.append('%s:%s'%(auth_user,auth_pass))
                url = 'ftp://%s%s'%(netloc,u.path.replace(' ','%20'))
                try:
                    ftpconn = ftplib.FTP(netloc)
                    res = ftpconn.login(auth_user, auth_pass)
                    print('* ftp',res.split("\n")[0])
                    # 230 Login successful.
                    if res[0] == '2':
                        firstline = res[4:].split('\n')[0].strip("\r")
                        ftpconv = ftpconv + 'HTTP/1.0 200 ' + firstline + '\r\n'

                    # 550 SIZE not allowed in ASCII mode
                    ftpconn.voidcmd('TYPE I')

                    ftpsz = ftpconn.size(u.path)
                    print('* ftp size 213',ftpsz)
                    # 213 811
                    ftpconv = ftpconv + 'Content-Length: ' + str(ftpsz) + '\r\n'
                    ftptm=ftpconn.sendcmd("MDTM %s"%(u.path))
                    print('* ftp time',ftptm)
                    # 213 20171117140512
                    if ftptm[0] == '2':
                        tm = ftptm[4:]
                        # YYYYMMDDhhmmss, optional fraction of a second ignored
                        matchtm=re.match(r'^(\d+)(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(?:\.\d+)?$',tm)
                        if matchtm:
                            try:
                                z = datetime.datetime(*[int(g) for g in matchtm.groups()])
                                ftpconv = ftpconv + 'Last-Modified: ' + z.strftime("%a, %d %b %Y %H:%M:%S GMT") + '\r\n'
                            except ValueError:
                                # out-of-range date from the server: no stamp
                                pass
                    ftpconn.quit()
                    ftpconn = None
                    ftpconv = ftpconv + '\r\n'
                except ftplib.all_errors as e:
                    # keep whatever was collected before the failure
                    print('* ftp',e)

                hdr = http_headerblock(io.BytesIO(ftpconv.encode('UTF-8')))
                if hdr.lines is None:
                    # the probe produced nothing, and curl's ftp output has
                    # no header block of its own to parse
                    self.error = "ftp probe failed"
                    break
            else:
                hdr = None

            all_args.append('--')
            all_args.append(url)

            run_curl = subprocess.Popen(all_args, stdout=subprocess.PIPE)
            #sys.stderr.write("\n"+' | '.join(all_args)) # XXX

            if hdr is None:
                hdr = http_headerblock(run_curl.stdout)

            # if via proxy, there might be a first HTTP response about the connection
            if hdr.lines is not None and hdr.status_code is not None and len(hdr.lines) == 0 and hdr.status_code == 200:
                hdr = http_headerblock(run_curl.stdout)
            elif hdr.status_code is not None and (int(hdr.status_code) == 200 or int(hdr.status_code) == 150):
                if not debug:
                    sys.stderr.write("\r-- %s -- %s%s%s\n" %
                                     (url, ANSI_GREEN, str(hdr.status_code),
                                      ANSI_OFF))  # XXX
            elif hdr.status_code is not None and int(hdr.status_code) > 305:
                sys.stderr.write("\r-- %s -- %s%s%s\n" % (
                    url, ANSI_RED, str(hdr.status_code), ANSI_OFF))  # XXX
            else:
                sys.stderr.write("\r-- %s -- %s%s%s\n" % (
                    url, ANSI_OFF, str(hdr.status_code), ANSI_OFF))  # XXX

            if hdr.lines is None:
                ## no output at all: curl itself failed
                ## XXX kill first?
                res = run_curl.wait()
                if res == 35:
                    self.error = "curl returned err %d, try without -k" % (
                        res, )
                elif res == 60:
                    self.error = "curl returned err %d, try with -k" % (res, )
                else:
                    self.error = "curl returned err %d" % (res, )
                break

            ## check if redirect
            if hdr.status_code >= 300 and hdr.status_code <  400:
                self.error = "redirecting"
                self.ext_args = []
                # body is of no interest: stop curl and reap it
                run_curl.kill()
                run_curl.stdout.close()
                run_curl.wait()
                try:
                    loc = hdr.headers['LOCATION'][0]
                except KeyError:
                    self.error = "HTTP status %d without Location header" % (
                        hdr.status_code, )
                    break
                self.refer, url = self.canonical_redirect(url, loc)
                self.progress = 'redirected to'
                # content not seen yet, restarting from new url
                continue

            ## prepare to process body
            expected_size = hdr.get_content_length()
            if expected_size == 0: expected_size = None

            #CONTENT-DISPOSITION inline; filename="WWW.pdf"
            # ['inline',None,'filename','"WWW.pdf"']
            # XXX RFC6266 / RFC5987
            # filename="EURO rates";
            # filename*=utf-8''%e2%82%ac%20rates
            # foo: bar; title*=iso-8859-1'en'%A3%20rates
            # inline; filename -- botched
            try:
                for l in hdr.get_all_headers('Content-Disposition'):
                    z = None
                    if len(l) >= 4 and l[2].upper().strip() == 'FILENAME':
                        z = l[3].strip('\x27\x22')
                    if len(l) >= 4 and l[2].upper().strip() == 'FILENAME*':
                        # charset'language'value
                        arr = l[3].strip().split("'")
                        if len(arr) > 2:
                            z = arr[2].strip('\x27\x22\x09\x20')
                    if z:
                        # str.replace returns a new string
                        z = z.replace('%20','_')
                        self.filename = z
            except (KeyError, TypeError, AttributeError,UnicodeError):
                # AttributeError when filename is not specified (None)
                pass

            ## debugging to seek for new headers containing information
            for n in sorted(hdr.headers):
                if ('LANGUAGE' in n) or \
                   (n[0:6] in ['ACCESS','ACCEPT','CACHE-','X-GUPL','X-CACH','PROXY-','X-FRAM','X-TIME']) or \
                   (n in ['SERVER','AGE','ALT-SVC','CONNECTION','VARY','CF-RAY','PRAGMA','VIA']) or \
                   (n in ['DATE','LAST-MODIFIED','EXPIRES']) or \
                   (n in ['CONTENT-LENGTH','CONTENT-TYPE','ETAG','SET-COOKIE','P3P']) or \
                   (n[0:10] in ['X-GOOG-MET','X-GOOG-STO','X-GOOG-GEN','STRICT-TRA','X-GOOG-HAS','X-AMZ-REQU','X-AMZ-ID-2']) or \
                   (n[0:10] in ['CF-CACHE-S','X-XSS-PROT','X-CONTENT-','X-POWERED-','X-UA-COMPA','TIMING-ALL','X-GITHUB-R']) or \
                   (n[0:10] in ['X-FASTLY-R','X-GEO-BLOC','CONTENT-SE','X-SERVED-B','SOURCE-AGE','TRANSFER-E']) or \
                   False:
                    continue
                if ('CONTENT-' in n) or \
                   ('LAST-' in n) or \
                   ('GOOG-' in n) or \
                   True:
                    sys.stderr.write("%s %s\n" % (n, ":".join(hdr.headers[n])))

            ## -- http://central.maven.org/maven2/com/lowagie/itext/2.1.7/itext-2.1.7.jar -- 200
            # X-CHECKSUM-SHA1 892bfb3e97074a61123b3b2d7caa2db112750864
            # X-CHECKSUM-MD5 7587a618197a065eac4a453d173d4ed6
            # XXX sha256
            # X-CHECKSUM-SHA2 712913a083f07dbacfdf5686364127ba8d457418458df0f6b899de8efa3d91af
            # X-AMZ-META-S3CMD-ATTRS atime:1554763479/ctime:1554763036/gid:100/gname:users/md5:b5beced41ccb9db2cd567e97d18cf5bf/mode:33188/mtime:1554762731/uid:1000/uname:andy
            # get_single_header() gives None for a missing or repeated header
            d = hdr.get_single_header('X-Checksum-SHA1')
            if d is not None:
                d = d.lower()
                if self.expected_sha1:
                    if self.expected_sha1 != d:
                        sys.stderr.write(
                            "!! different SHA1 digests to check\n")
                else:
                    self.expected_sha1 = d

            d = hdr.get_single_header('X-Checksum-MD5')
            if d is not None:
                d = d.lower()
                if self.expected_md5:
                    if self.expected_md5 != d:
                        sys.stderr.write("!! different MD5 digests to check\n")
                else:
                    self.expected_md5 = d

            ## http://download.system-cfg.com/f.php?h=2Z0Khs23&d=1
            ## CONTENT-MD5 6c3156d38333f715ad5cc0ed73feba19
            # XXX -- http://hackage.haskell.org/package/xmonad-0.15/xmonad-0.15.tar.gz -- 200
            # CONTENT-MD5 OkX/s6RkgtqNlIc8SKed+g==
            d = hdr.get_single_header('Content-MD5')
            if d is not None:
                dig = None
                if len(d) == 32:
                    dig = d.lower()
                else:
                    try:
                        dig = binascii.hexlify(base64.b64decode(d)).decode('ASCII').lower()
                    except ValueError:
                        sys.stderr.write("!! unparsable Content-MD5 header\n")
                if dig:
                    if self.expected_md5:
                        if self.expected_md5 != dig:
                            sys.stderr.write("!! different MD5 digests to check\n")
                    else:
                        self.expected_md5 = dig
            ## x-goog-hash: md5=<base64 of binary>
            try:
                for h in hdr.headers['X-GOOG-HASH']:
                    (t, d) = h.split('=', 1)
                    if t.strip().upper() == 'MD5':
                        dig = binascii.hexlify(base64.b64decode(d.strip())).decode('ASCII').lower()
                        if self.expected_md5:
                            if self.expected_md5 != dig:
                                sys.stderr.write(
                                    "!! different MD5 digests to check\n")
                        else:
                            self.expected_md5 = dig
            except (KeyError, ValueError, TypeError):
                pass

            # hack - sometimes MD5 is part of the etag
            # < Last-Modified: Tue, 18 Sep 2018 06:30:07 GMT
            # < ETag: "eaf82392603b92dae632cc0f356b08aa:1537252207"
            # XXX also try fetching stamp if not in last-modified

            try:
                self.etag = hdr.get_single_header('Etag').strip("\x22\x27")
                m = re.match('([0-9a-fA-F]+)($|:)', self.etag)
                if m and len(m.group(1)) == 32:
                    dig = m.group(1).lower()
                    if self.expected_md5 and self.expected_md5 != dig:
                        sys.stderr.write(
                                    "!! different MD5 digests to check\n")
                    else:
                        self.expected_md5 = dig
            except (AttributeError, TypeError):
                # no (single) ETag header
                self.etag = None

            # e.g. github release assets redirect to an S3 URL carrying
            # ...&response-content-disposition=attachment%3B%20filename%3Dmosh-1.3.2.pkg&response-content-type=application%2Foctet-stream

            # correct filename if needed
            self.filename = self.canonical_filename(self.filename, url)

            ## try to resume? XXX allow-resume
            ## -C, --continue-at <offset>

            ## prepare hashing (git hash needs size beforehand)
            hash_md5 = hashlib.md5()
            hash_sha1 = hashlib.sha1()
            hash_sha256 = hashlib.sha256()
            hash_githash = None
            if expected_size:
                hash_githash = hashlib.sha1()
                hash_githash.update(b"blob %d\0" % (expected_size))

            ## start downloading to a temporary file
            if hdr.status_code != 200:
                self.error = "HTTP status %d %s" % (hdr.status_code,
                                                    hdr.reason)
                sys.stderr.write("!! %s%s%s" %
                                 (ANSI_RED, self.error, ANSI_OFF))
                fd, self.tmpname = tempfile.mkstemp(
                    prefix='.err%d' % (hdr.status_code), dir='.')
            else:
                self.error = "downloading"
                fd, self.tmpname = tempfile.mkstemp(prefix='.dl_', dir='.')

            ## TODO progress
            counted_size, i, x = 0, 0, 99
            shortened_url = self.orig_url[0:100]
            if self.filename not in shortened_url:
                shortened_name = ' .../' + self.filename[-95:]
                shortened_url = shortened_url[:120 - len(shortened_name
                                                         )] + shortened_name
            if not debug:
                sys.stderr.write("\r %15s %s%s \r" %
                                 ('', shortened_url, ANSI_OFF))

            # the file is closed even if reading or hashing fails
            with os.fdopen(fd, "wb") as f:
                set_extended_attr(f.fileno(),'origurl',self.orig_url)
                # fixed-size reads: a binary body has no meaningful "lines",
                # line iteration could buffer an arbitrarily large chunk
                for buf in iter(lambda: run_curl.stdout.read(65536), b''):
                    f.write(buf)
                    counted_size = counted_size + len(buf)
                    z = time.gmtime().tm_sec
                    if x != z and not debug:
                        # refresh the progress display once per second
                        i = i + 1
                        x = z
                        if expected_size:
                            perc = '%3d%%|%-9d' % (100.0 * counted_size /
                                                   expected_size, expected_size)
                        else:
                            perc = '%-14d' % (counted_size)
                        sys.stderr.write("\r %s%s \r" % ("_oOo" [i % 4], perc))
                    hash_md5.update(buf)
                    hash_sha1.update(buf)
                    hash_sha256.update(buf)
                    if hash_githash: hash_githash.update(buf)

            # stdout is at EOF; reap the child so it does not linger
            run_curl.wait()

            ## need to have the timestamp before renaming
            ## LAST-MODIFIED Wed, 21 Dec 2016 03:44:53 GMT
            ##                   1482291893
            ## X-GOOG-GENERATION 1482291893643481
            ## MEMENTO-DATETIME Thu, 16 Oct 2008 15:18:32 GMT
            ## X-ARCHIVE-ORIG-DATE Thu, 16 Oct 2008 15:18:32 GMT
            ## X-ARCHIVE-ORIG-ETAG "2967404-3200700"
            ## X-ARCHIVE-ORIG-LAST-MODIFIED Wed, 06 Feb 2008 03:08:53 GMT
            ## X-ARCHIVE-ORIG-CONTENT-LENGTH 12062338

            try:
                self.stamp = hdr.get_chrono_header('Last-Modified')
            except Exception:
                self.stamp = None
            if self.stamp == -1: self.stamp = None
            # this overrides the naive stamp
            z = hdr.get_chrono_header('X-Archive-Orig-Last-Modified')
            if z:
                 self.stamp = z
            # if it's coming from G* with a higher resolution time
            try:
                z = float(hdr.get_single_header('X-GOOG-Generation'))
                if abs(z / 1000000.0 - self.stamp) < 5:
                    self.stamp = z / 1000000.0
            except (ValueError, TypeError):
                # header or stamp missing
                pass

            self.stored = int(time.time())
            try:
                ## save server's idea of the time if not too far off
                d = hdr.get_chrono_header('Date')
                if abs(d - self.stored) < 7200:
                    self.stored = d
            except Exception:
                pass

            # check if download is complete
            if hdr.status_code == 200:
                statbuf = os.stat(self.tmpname)
                if statbuf.st_size != counted_size:
                    counted_size = statbuf.st_size
                if expected_size is None or counted_size == expected_size:
                    self.filesize = int(counted_size)
                    self.error = None
                    ## success
                elif expected_size is not None:
                    if counted_size < expected_size:
                        self.error = "loaded only %d bytes of %d" % (
                            counted_size, expected_size)
                        os.remove(self.tmpname)
                        self.tmpname = None
                        break
                    else:
                        # XXX hack (help force)
                        self.error = None
                        self.filesize = counted_size
                        sys.stderr.write("?? read more (%d of %d)\n" %
                                         (counted_size, expected_size))
            else:
                # XXX did remove, but may need for redirect parsing
                break

            self.server = hdr.get_single_header('Server')

            # store Content-Type: mimetype; charset=whatever
            try:
                for l in hdr.get_all_headers('Content-Type'):
                    self.mimetype = l[0].strip()
                    if len(l) > 3 and l[2].upper().strip() == 'CHARSET':
                        self.charset = l[3].strip()
            except (TypeError, IndexError):
                self.charset = None

            self.encoding = hdr.get_single_header('Content-Encoding')
            z = hdr.get_single_header('X-GOOG-Stored-Content-Encoding')
            if z and self.encoding is None:
                self.encoding = z

            self.hashes = [
                hash_md5.hexdigest(), hash_sha1.hexdigest(),
                hash_sha256.hexdigest(), None
            ]
            if hash_githash: self.hashes[3] = hash_githash.hexdigest()

            # hook: the content itself may point to the real download
            self.refer, url = self.content_redirect(url, self.tmpname,
                                                    self.mimetype)
            if url is None:
                break
            # else start again

        return None

    def canonical_url(self, url, frag):
        """
        Hook called before a URL is used; returns the (url, fragment) pair
        to work with.  This implementation passes both through unchanged.
        """
        return (url, frag)

    def canonical_redirect(self, url, loc):
        """
        Resolve a redirect target against the URL that issued it.

        Args:
            url (str): URL of the redirecting response
            loc (str): redirect target, absolute or relative

        Returns:
            (referer, new_url): ``url`` unchanged, and ``loc`` joined onto
            ``url`` with spaces written as %20
        """
        # unsure if correct
        target = loc.replace(' ', '%20')
        return url, urllib.parse.urljoin(url, target)

    def canonical_filename(self, filename, url):
        """
        Normalise the name for PDF files.

        If ``filename`` ends in ".pdf" (any case) it is returned with the
        extension lower-cased.  Otherwise, if the last path segment of
        ``url`` ends in ".pdf" (any case), that segment is returned with the
        extension lower-cased.  In every other case ``filename`` is returned
        as it is.
        """
        stem, ext = filename[:-4], filename[-4:]
        if ext.upper() != '.PDF':
            # fall back to the name found at the end of the URL
            tail = url.split('/')[-1]
            if tail[-4:].upper() != '.PDF':
                return filename
            stem = tail[:-4]
        return stem + '.pdf'

    def content_redirect(self, url, tmpname, mimetype):
        """
        Hook called after a completed download; returns (referer, next_url).
        A next_url of None ends the download loop in __init__.  This
        implementation never redirects.
        """
        return (None, None)

    def savefile(self, filename=None, debug=False):
        """Move the downloaded temporary file into place and record metadata.

        The download is refused (nothing is renamed) when the transfer
        reported an error, when there is no temporary file, when no unused
        name can be found, or when the content looks like an HTML/text page
        instead of the expected payload.  Otherwise the temporary file is
        renamed into the current directory, the digests are compared with the
        expected ones and reported on stderr, and the metadata is appended to
        the ``.meta`` text file (or the file named by $METAFILE), to the
        per-host JSON log below $FLMETADIR and to the sqlite database named
        by $FLDB.

        filename -- name to save as; defaults to self.filename
        debug    -- uncoloured output, dump the JSON record to stderr and do
                    not stop just because self.error is set

        Always returns None.
        """
        if debug:
            ANSI_OFF = ANSI_RED = ANSI_GREEN = ANSI_YEL = ANSI_EL = ''
            CR = '\n'
        else:
            ANSI_OFF = '\033[m\033[K'
            ANSI_RED = '\033[31m'
            ANSI_GREEN = '\033[32m'
            ANSI_YEL = '\033[38;5;11m'
            ANSI_EL = '\033[K' # tput el
            CR = '\r'

        if self.error and not debug:
            sys.stderr.write("!! cannot save, %s%s%s\n" %
                             (ANSI_RED, self.error, ANSI_OFF))
            return None
        if not self.tmpname:
            return None

        # step - find a filename (or a duplicate prefix to make it unique)
        if filename is None:
            filename = self.filename

        # Settle the final spelling of the name *before* probing for
        # collisions, so the name that is checked is the name that is written
        # (otherwise the rename below could overwrite an existing file).
        # swift-swift-5.0.0.tar.gz really?
        splitfnarr = filename.split('-', 2)
        if len(splitfnarr) > 2 and splitfnarr[0] == splitfnarr[1]:
            filename = '-'.join(splitfnarr[1:])
        filename = filename.replace('/', '_')

        # XXX todo
        # duplicate by hash?
        # duplicate by name on fs elsewhere
        # duplicate by name in db (case-insensitive)
        # XXX todo

        # check if name already in use: try the plain name, then two prefixes
        for dupprefix in ('', 'dup.', 'dup.%d.' % (os.getpid(),)):
            try:
                os.stat(dupprefix + filename)
            except OSError:
                break  # nothing there under that name - use it
        else:
            return None  # every candidate name is taken
        filename = dupprefix + filename

        # XXX try to match with constraints instead?
        if self.filename[-4:].upper() == '.PHP':
            sys.stderr.write("!! not saving, .php extension\n")
            return None

        # Sniff the raw bytes: the head of a binary file (a real PDF, say) is
        # usually not valid UTF-8, so it must not be decoded.
        with open(self.tmpname, 'rb') as checkf:
            first20 = checkf.read(20)
        if b'<!DOCTYPE' in first20.upper():
            self.mimetype = 'text/html'

        # the size may not be known; treat that as empty for the sanity checks
        size = self.filesize or 0

        if self.mimetype and self.mimetype.upper() == 'TEXT/HTML' and not self.allow_text:
            sys.stderr.write("!! not saving, MIME type is %s%s\n" %
                             (self.mimetype, ANSI_OFF))
            # XXX grep WARNING
            return None
        if self.mimetype and self.mimetype.upper()[0:5] == 'TEXT/' and not self.allow_text:
            # Servers often label archives and the like as text/plain:
            # -- https://gnupg.org/ftp/gcrypt/npth/npth-1.5.tar.bz2 -- 200
            # !! not saving, MIME type is text/plain (npth-1.5.tar.bz2)
            # so trust a known extension over the declared type.
            ext = self.filename.split('.')[-1].upper()
            if '/raw.githubusercontent' in self.orig_url:
                pass
            elif self.filename[-7:].upper() == '.TAR.GZ':
                self.mimetype = 'application/x-gzip'
            elif self.filename[-7:].upper() == '.TAR.XZ':
                self.mimetype = 'application/x-xz'
            elif ext == 'BZ2':
                self.mimetype = 'application/x-bzip2'
            elif ext[:6] == 'SHA256':
                self.mimetype = 'application/octet-stream'
            elif ext == 'ADF':
                if size < 819200:
                    sys.stderr.write("!! not saving, ADF %s size %d\n" % (self.filename, size,))
                    return None
                self.mimetype = 'application/octet-stream'
            elif ext == 'PDF':
                if size < 8192:
                    sys.stderr.write("!! not saving, PDF %s size %d\n" % (self.filename, size,))
                    return None
                # an HTML page posing as a PDF was already refused above
                self.mimetype = 'application/pdf'
            elif ext not in ['SIG', 'SIGN', 'PUB', 'ASC', 'TXT', 'C', 'H', 'CSV', 'S', 'ASM', 'MD5','SHA1','SHA256','SHA512','DIFF','PATCH','SUM','SUMS']:
                sys.stderr.write("!! not saving, MIME type is %s (%s) %s\n" %
                                 (self.mimetype, self.filename, ANSI_OFF))
                return None

        if self.mimetype and self.mimetype[0:5].upper() == 'IMAGE' and '/img/' in self.orig_url:
            sys.stderr.write("!! not saving, MIME type is %s (%s) %s\n" %
                             (self.mimetype, self.filename, ANSI_OFF))
            return None

        # postponing these - the temp file won't be seen as proper
        if self.stamp:
            os.utime(self.tmpname, (self.stamp, self.stamp))
        os.chmod(self.tmpname, 0o644)

        self.filename = filename
        print(self.filename,self.tmpname,ANSI_EL)
        os.rename(self.tmpname, self.filename)
        self.tmpname = None

        # XXX xattr

        # check digests; a digest without an expected value is not reported
        good = []
        bad = []
        for label, expected, actual in (
                ('MD5', self.expected_md5, self.hashes[0]),
                ('SHA1', self.expected_sha1, self.hashes[1]),
                ('SHA256', self.expected_sha256, self.hashes[2])):
            if expected:
                (good if expected == actual else bad).append(label)
        # report digest verification
        if bad:
            report_color = ANSI_RED
            ok = 'BAD ' + ' '.join(bad)
        else:
            report_color = ANSI_GREEN
            good.append('OK')
            ok = ' '.join(good)
        # xxx no LF when debug
        if not self.filesize:
            self.filesize = 0
        sys.stderr.write(
            "%s%s%s%s%s %s%s%s (%d)%s\n" %
            (CR, ("\n" if len(bad) and not debug else ""), ANSI_YEL, filename, ANSI_OFF, report_color, ok, ANSI_OFF, self.filesize, ANSI_EL))

        ## 1/4 write .meta as text with some fields
        ## order cannot be changed - append only (or rather ignore)

        ## don't store extra precision if not present
        if self.stamp:
            safestamp = '%f' % (self.stamp, )
        else:
            safestamp = self.stamp
        try:
            (i, frac) = safestamp.split('.', 1)
            if int(frac) == 0:
                safestamp = int(i)
            else:
                safestamp = float(safestamp.rstrip('0'))
        except (ValueError, AttributeError):
            # no stamp at all, or not in <int>.<frac> form: keep as is
            pass

        # self.filename is left as UTF-8 bytes afterwards (as before); the
        # records below use the text form.  A name that cannot be encoded
        # stays a str and is recorded with the offending characters replaced.
        try:
            self.filename = self.filename.encode('utf-8')
            name_text = self.filename.decode('utf-8')
        except UnicodeError:
            name_text = self.filename.encode('utf-8', 'replace').decode('utf-8')

        metafile = '.meta'
        if not os.path.isfile(metafile) and METAFILE in os.environ:
            metafile = os.environ[METAFILE]

        with codecs.open(metafile, encoding='utf-8', mode='a') as f:
            f.write("\t".join([str(i) if i else '?' for i in \
              [name_text,self.filesize,safestamp,\
               self.hashes[0],self.mimetype,self.encoding,\
               self.orig_url,self.stored,self.server,self.etag]])+"\n")

        def open_db():
            # The database is optional: $FLDB unset or not openable just
            # disables the database steps.
            try:
                return sqlite3.connect(os.environ['FLDB'])
            except (KeyError, sqlite3.Error):
                return None

        ## 2/4 avoid further records if duplicate
        ## todo - make unique file before saving
        conn = open_db()
        try:
            # XXX check duplicate in zip
            # Archive:  whatever.zip
            # Length   Method    Size  Ratio   Date   Time   CRC-32    Name
            # --------  ------  ------- -----   ----   ----   ------    ----
            # 11984777  Defl:X 11366006   6%  08-11-11 17:23  6111b8e4  whatever.pdf
            # --------          -------  ---                            -------
            # 11984777         11366006   6%                            1 file
            # genattr whatever.pdf
            # user.crc32="6111b8e4"
            # user.length="11984777"

            if conn is not None:
                rows = conn.execute("""
                select path,fn from meta where
                md5 = ?
                """,(self.hashes[0],)).fetchall()
                if rows:
                    dupmsg = 'not storing dup of %s in %s'%(rows[0][1],rows[0][0],)
                    # XXX rename? check hash db NNN1
                    if len(rows) > 1:
                        dupmsg = '%s (%d in total)'%(dupmsg,len(rows),)
                    if filename[:4] == 'dup.':
                        print('!!',dupmsg)
                    else:
                        dupprefix = 'dup.'
                        try:
                            # XXX check1 if duplicate by hash - try to not save
                            # break out via OSError if file with that name does not exist
                            os.stat(dupprefix + filename)
                            dupprefix = 'dup.%d.' % (os.getpid(),)
                            os.stat(dupprefix + filename)
                            return None
                        except OSError:
                            pass
                        os.rename(filename,dupprefix+filename)
                        print('!!',dupmsg,'--',dupprefix+filename)
                    return None

            ## 3/4 write per-user/host meta - may fail silently
            ## can be extended interchangeably
            try:
                year = time.gmtime(self.stored).tm_year
            except TypeError:
                year = time.gmtime(time.time()).tm_year
            d={'file':name_text,'size':self.filesize,'stamp':safestamp,\
            'md5':self.hashes[0],'sha1':self.hashes[1],'sha256':self.hashes[2],\
            'githash':self.hashes[3],'mimetype':self.mimetype,'charset':self.charset,\
            'encoding':self.encoding,'url':self.orig_url,\
            'stored':self.stored,'server':self.server,'etag':self.etag}
            try:
                with open('%s/%04d-%s.meta' % (os.environ[FL], year,
                                               platform.node()[0:6]),
                          "a") as flmeta:
                    flmeta.write("%s\n" % (json.dumps(d, cls=MetaFLJSONEncoder)))
            except (KeyError, OSError, TypeError, ValueError):
                # best effort only: directory not configured, not writable,
                # or the record cannot be serialised
                pass
            if debug:
                sys.stderr.write("%s\n" % (json.dumps(d, cls=MetaFLJSONEncoder)))

            ## 4/4 update database
            if conn is None:
                # second chance, in case the first attempt failed
                conn = open_db()

            if conn is not None:
                conn.execute("""
                insert into meta
                (path,fn,size,stamp,md5,sha1,sha256,githash,mimetype,mimecs,mimeenc,url,stored,server,etag)
                values(?,?,?,datetime(?,'unixepoch'),?,?,?,?,?,?,?,?,datetime(?,'unixepoch'),?,?)""",\
                (os.getcwd(),name_text,self.filesize,self.stamp,\
                self.hashes[0],self.hashes[1],self.hashes[2],self.hashes[3],\
                self.mimetype,self.charset,self.encoding,self.orig_url,\
                self.stored,self.server,self.etag))
                conn.commit()
        finally:
            # also reached by the early returns of the duplicate check
            if conn is not None:
                conn.close()

    def __del__(self):
        try:
            if self.tmpname:
                os.remove(self.tmpname)
                self.tmpname = None
        except OSError:
            pass


###########################################################################
###########################################################################


def main():
    """Command-line entry point: fetch every URL given and save it.

    Accepts a subset of curl's options.  Most are handed through to the curl
    command unchanged; --silent, --include and --globoff are always passed,
    and a few options (--curl-command, --debug-summary, --text, --dump-header,
    --1.0, --2.0) are consumed here and never forwarded.  If $SOCKS5 is set it
    is passed to curl as --socks5-hostname.  Each URL is downloaded through
    singlefile() and stored with its savefile() method.
    """
    parser = argparse.ArgumentParser(
        description='Wrapper for curl to keep HTTP metadata', prog='metacurl')
    ## grep E\} src/tool_getparam.c|sed 's/[{} "]//g'
    # vvv
    ## Oa,remote-name-all,FALSE,
    ## r,range,TRUE,
    ## R,remote-time,FALSE,
    ## s,silent,FALSE,
    ## S,show-error,FALSE,
    # i,include,FALSE,
    # k,insecure,FALSE,
    # K,config,TRUE,
    # H,header,TRUE,
    # Hp,proxy-header,TRUE,
    # D,dump-header,TRUE,
    parser.add_argument('-i', '--include', action='store_true')
    parser.add_argument('--remote-name-all', action='store_true')
    parser.add_argument('-b', '--cookie')
    parser.add_argument('-c', '--cookie-jar')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('-N', '--no-buffer', action='store_true')
    parser.add_argument('-s', '--silent', action='store_true')
    parser.add_argument('-r', '--range')
    # Disable URL sequences and ranges using {} and [].  This is a plain flag
    # in curl; declaring it with a value would swallow the next argument.
    parser.add_argument('-g', '--globoff', action='store_true')

    parser.add_argument('-k', '--insecure', action='store_true')
    parser.add_argument('-K', '--config')
    parser.add_argument('-H', '--header', action='append')
    parser.add_argument('--data-binary')
    parser.add_argument('--data')
    # takes a header line and may be repeated, just like --header
    parser.add_argument('--proxy-header', action='append')
    parser.add_argument('--compressed', action='store_true')
    parser.add_argument('--socks5-hostname')
    parser.add_argument('-D', '--dump-header')
    parser.add_argument('-x', '--proxy')
    parser.add_argument('-0', '--http1.0', action='store_true')
    # ^^^
    # --2.0 ignored for compatibility
    # -s should be always there (silent)
    # -o should not be present (output)
    parser.add_argument(
        '--2.0', action='store_const', const=False, default=False)
    parser.add_argument(
        '--1.0', action='store_const', const=False, default=False)
    parser.add_argument('--curl-command', default='curl')
    parser.add_argument('--debug-summary', action='store_true')
    parser.add_argument('--text', action='store_true')
    # debug?
    parser.add_argument('URL', nargs='+')
    args = parser.parse_args()

    curl_args = [args.curl_command]

    ## default arguments
    # User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0
    # Maps an upper-cased header name to the complete header line that is
    # added when the user did not supply that header.
    # NOTE(review): the ACCEPT-ENCODING entry lacks the "Accept-Encoding: "
    # field name, so curl is handed a line without a colon.  It is left
    # unchanged here because really requesting compressed transfers changes
    # what servers send back - confirm the intent before correcting it.
    need_headers = {
        'DNT': 'DNT: 1',
        'ACCEPT-ENCODING': 'gzip, deflate',
    }
    # --silent and --include needs to be there always
    # along with --globoff
    curl_args.extend(['-s', '-i', '-g'])
    # try to add proxying unconditionally
    socks5 = os.environ.get('SOCKS5')
    if socks5 is not None:
        curl_args.extend(['--socks5-hostname', socks5])

    # add those headers if not present
    headers = list(args.header or [])
    supplied = {h.split(':', 1)[0].upper().strip('\x20\x09') for h in headers}
    for key, line in need_headers.items():
        if key not in supplied:
            headers.append(line)
    args.header = headers

    # options that may be given several times: forwarded once per value
    repeatable = ('header', 'proxy_header')
    # silent/include/globoff will be added always, the rest will be skipped
    not_forwarded = (
        'silent', 'include', 'globoff',
        '1.0', '2.0', 'dump_header',
        'text',
        'curl_command', 'debug_summary', 'URL',
    )
    for name, value in vars(args).items():
        option = '--%s' % (name.replace('_', '-'), )
        if name in repeatable:
            for item in value or []:
                curl_args.extend([option, '%s' % (item, )])
        elif name in not_forwarded:
            continue
        elif value is True:
            curl_args.append(option)
        elif value:
            curl_args.extend([option, '%s' % (value, )])

    for url in args.URL:
        savedfile = singlefile(url, curl_args, args.debug_summary, args.text)
        savedfile.savefile(debug=args.verbose or args.debug_summary)


###########################################################################
###########################################################################


def curlopts():
    """Placeholder: takes no arguments, does nothing and returns None."""
    return None


###########################################################################
###########################################################################

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()