Skip to content
Permalink
Browse files Browse the repository at this point in the history
Refactor content downloading and provisioning
We needed to rework the DB because of an issue with files having
the same hash but not the same name. E.g.:

http://snapshot.debian.org/archive/debian/20210825T203108Z/pool/main/e/e2fsprogs/e2fsprogs_1.45.7.orig.tar.gz{,.asc}

Closes #4
  • Loading branch information
fepitre committed Sep 14, 2021
1 parent f846875 commit 572309a
Show file tree
Hide file tree
Showing 4 changed files with 709 additions and 613 deletions.
48 changes: 25 additions & 23 deletions api/snapshot_api.py
Expand Up @@ -27,7 +27,7 @@
from flask_sqlalchemy import SQLAlchemy
from dateutil.parser import parse as parsedate
from db import DBarchive, DBtimestamp, DBfile, DBsrcpkg, DBbinpkg, \
FilesLocations, BinpkgFiles, ArchivesTimestamps, DATABASE_URI
HashesLocations, BinpkgFiles, ArchivesTimestamps, DATABASE_URI

# flask app
app = Flask("DebianSnapshotApi")
Expand Down Expand Up @@ -55,23 +55,26 @@ class SnapshotEmptyQueryException(SnapshotException):
pass


def file_desc(file):
def file_desc(h):
locations = []
for raw_location in db.session.query(FilesLocations).filter_by(file_sha256=file.sha256):
for r in db.session.query(HashesLocations, DBfile.name, DBfile.path, DBfile.size)\
.filter_by(sha256=h.sha256)\
.join(DBfile, DBfile.sha256 == h.sha256)\
.all():
timestamp_ranges = []
for rg in raw_location[4]:
for rg in r.timestamp_ranges:
timestamp_ranges.append(
(parsedate(rg[0]).strftime("%Y%m%dT%H%M%SZ"),
parsedate(rg[-1]).strftime("%Y%m%dT%H%M%SZ"))
)
location = {
"name": file.name,
"path": file.path,
"size": file.size,
"name": r.name,
"path": r.path,
"size": r.size,

"archive_name": raw_location[1],
"suite_name": raw_location[2],
"component_name": raw_location[3],
"archive_name": r.archive_name,
"suite_name": r.suite_name,
"component_name": r.component_name,
"timestamp_ranges": timestamp_ranges
}
locations.append(location)
Expand Down Expand Up @@ -153,12 +156,12 @@ def file_info(file_hash):
try:
# we have only one file because we use sha256 as hash
# compared to snapshot.d.o
file = db.session.query(DBfile).get(file_hash)
if not file:
files = db.session.query(DBfile).filter_by(sha256=file_hash).all()
if not files:
raise SnapshotEmptyQueryException
status_code = 200
api_result.update({
"result": file_desc(file),
"result": [file_desc(f) for f in files],
})
except SnapshotEmptyQueryException:
status_code = 404
Expand Down Expand Up @@ -223,12 +226,12 @@ def srcfiles(srcpkgname, srcpkgver):
api_result.update({
"package": srcpkgname,
"version": srcpkgver,
"result": [{"hash": file.sha256} for file in package.files],
"result": [{"hash": file.sha256} for file in package.hashes],
})
if fileinfo == "1":
api_result["fileinfo"] = {}
for file in package.files:
api_result["fileinfo"][file.sha256] = file_desc(file)
for h in package.hashes:
api_result["fileinfo"][h.sha256] = file_desc(h)
except SnapshotEmptyQueryException:
status_code = 404
except Exception as e:
Expand Down Expand Up @@ -278,16 +281,15 @@ def binfiles(pkg_name, pkg_ver):
"binary": pkg_name,
"result": [
{
"hash": associated_file.file_sha256,
"hash": associated_file.sha256,
"architecture": associated_file.architecture
} for associated_file in binpackage.files
} for associated_file in binpackage.hashes
],
})
if fileinfo == "1":
api_result["fileinfo"] = {}
for associated_file in binpackage.files:
file = associated_file.file
api_result["fileinfo"][file.sha256] = file_desc(file)
for h in binpackage.hashes:
api_result["fileinfo"][h.sha256] = file_desc(h)
except SnapshotEmptyQueryException:
status_code = 404
except Exception as e:
Expand Down Expand Up @@ -317,8 +319,8 @@ def upload_buildinfo():
name = dep[0]['name']
_, version = dep[0]['version']
arch = dep[0]['arch'] or parsed_info['Build-Architecture']
results = db.session.query(BinpkgFiles.architecture, FilesLocations)\
.join(BinpkgFiles, BinpkgFiles.file_sha256 == FilesLocations.c.file_sha256)\
results = db.session.query(BinpkgFiles.architecture, HashesLocations)\
.join(BinpkgFiles, BinpkgFiles.sha256 == HashesLocations.c.sha256)\
.filter_by(binpkg_name=name, binpkg_version=version).all()
if len(results) == 0:
not_found.append((name, version, arch))
Expand Down
80 changes: 23 additions & 57 deletions db.py
Expand Up @@ -17,7 +17,8 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

from sqlalchemy import Column, Integer, BigInteger, String, ARRAY, Table, ForeignKey, ForeignKeyConstraint
from sqlalchemy import Column, Integer, BigInteger, String, ARRAY, Table, ForeignKey, \
ForeignKeyConstraint, UniqueConstraint
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, relationship
Expand Down Expand Up @@ -47,12 +48,13 @@ def db_create_session(readonly=False):
# Association tables


FilesLocations = Table(
'files_locations', Base.metadata,
Column('file_sha256', String, ForeignKey('files.sha256'), primary_key=True),
HashesLocations = Table(
'hashes_locations', Base.metadata,
Column('sha256', String, ForeignKey('hashes.sha256'), primary_key=True),
Column('archive_name', String, ForeignKey('archives.name'), primary_key=True),
Column('suite_name', String, ForeignKey('suites.name'), primary_key=True),
Column('component_name', String, ForeignKey('components.name'), primary_key=True),
Column('component_name', String, ForeignKey('components.name'), primary_key=True),
Column('timestamp_ranges', ARRAY(String), nullable=False)
# timestamp_ranges is an array of ranges. A range is defined as an array
# of two elements giving the begin/end of an interval among all available timestamps
Expand All @@ -66,11 +68,12 @@ def db_create_session(readonly=False):
Column('timestamp_value', String, ForeignKey('timestamps.value'), primary_key=True),
)


SrcpkgFiles = Table(
'srcpkg_files', Base.metadata,
Column('srcpkg_name', String, primary_key=True),
Column('srcpkg_version', String, primary_key=True),
Column('file_sha256', String, ForeignKey('files.sha256'), primary_key=True),
Column('sha256', String, ForeignKey('hashes.sha256'), primary_key=True),
ForeignKeyConstraint(
('srcpkg_name', 'srcpkg_version'),
('srcpkg.name', 'srcpkg.version')),
Expand All @@ -87,9 +90,9 @@ class BinpkgFiles(Base):
)
binpkg_name = Column(String, primary_key=True)
binpkg_version = Column(String, primary_key=True)
file_sha256 = Column(String, ForeignKey('files.sha256'), primary_key=True)
sha256 = Column(String, ForeignKey('hashes.sha256'), primary_key=True)
architecture = Column(String, ForeignKey('architectures.name'), primary_key=True)
file = relationship("DBfile")
hashes = relationship("DBhash")


# Main tables
Expand Down Expand Up @@ -144,13 +147,23 @@ def __repr__(self):
return f"<Architecture {self.name}>"


class DBhash(Base):
    """Row of the ``hashes`` table: a single SHA-256 digest.

    The digest is the table's only column and its primary key; other tables
    in this schema point at ``hashes.sha256`` through foreign keys.
    """

    __tablename__ = "hashes"

    # Hex digest, capped at 64 characters by the column type.
    sha256 = Column(String(64), primary_key=True)

    def __repr__(self):
        """Return a short debugging representation showing the digest."""
        return f"<Hash {self.sha256}>"


class DBfile(Base):
    """Row of the ``files`` table: one named file carrying a given hash.

    Rows are keyed by a surrogate integer ``id`` rather than by ``sha256``,
    so several files may share the same digest. The combination
    ``(sha256, name, size)`` is constrained to be unique.
    """

    __tablename__ = 'files'
    __table_args__ = (UniqueConstraint('sha256', 'name', 'size'),)

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Digest of the file content; references hashes.sha256.
    # NOTE: defined exactly once -- an earlier, immediately overwritten
    # ``sha256 = Column(String(64), primary_key=True)`` assignment was dead
    # code and has been dropped.
    sha256 = Column(String(64), ForeignKey('hashes.sha256'))
    size = Column(BigInteger, nullable=False)
    name = Column(String, nullable=False)
    path = Column(String, nullable=False)

    def __repr__(self):
        """Return a short debugging representation showing the digest."""
        return f"<File {self.sha256}>"
Expand All @@ -161,7 +174,7 @@ class DBsrcpkg(Base):

name = Column(String, primary_key=True)
version = Column(String, primary_key=True)
files = relationship("DBfile", secondary=SrcpkgFiles)
hashes = relationship("DBhash", secondary=SrcpkgFiles)

def __repr__(self):
return f"<Package {self.name}-{self.version}>"
Expand All @@ -172,54 +185,7 @@ class DBbinpkg(Base):

name = Column(String, primary_key=True)
version = Column(String, primary_key=True)
files = relationship("BinpkgFiles")
hashes = relationship("BinpkgFiles")

def __repr__(self):
return f"<Binary {self.name}-{self.version}>"


# Temporary tables for DB provisioning


class DBtempfile(Base):
    """Staging row of the ``tempfiles`` table used while provisioning the DB.

    The primary key is composite: digest, archive, timestamp, suite and
    component together identify a row.
    """

    __tablename__ = "tempfiles"
    # Emitted as a prefix of CREATE TABLE (i.e. CREATE UNLOGGED TABLE ...).
    # NOTE(review): presumably targets PostgreSQL unlogged tables -- confirm.
    __table_args__ = {"prefixes": ["UNLOGGED"]}

    # File description.
    sha256 = Column(String(64), primary_key=True)
    size = Column(BigInteger, nullable=False)
    name = Column(String, nullable=False)
    path = Column(String, nullable=False)

    # Location of the file; all four columns are part of the primary key.
    archive_name = Column(String, primary_key=True)
    timestamp_value = Column(String, primary_key=True)
    suite_name = Column(String, primary_key=True)
    component_name = Column(String, primary_key=True)

    def __repr__(self):
        """Return a short debugging representation showing the digest."""
        return f"<TempFile {self.sha256}>"


class DBtempsrcpkg(Base):
    """Staging row of the ``tempsrcpkg`` table used while provisioning the DB.

    Each row pairs a source package (name, version) with one file digest.
    """

    __tablename__ = "tempsrcpkg"
    # Emitted as a prefix of CREATE TABLE (i.e. CREATE UNLOGGED TABLE ...).
    # NOTE(review): presumably targets PostgreSQL unlogged tables -- confirm.
    __table_args__ = {"prefixes": ["UNLOGGED"]}

    # Surrogate integer primary key.
    srcpkg_id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)
    version = Column(String, nullable=False)
    file_sha256 = Column(String, nullable=False)

    def __repr__(self):
        """Return a short debugging representation ``name-version``."""
        return f"<TempPackage {self.name}-{self.version}>"


class DBtempbinpkg(Base):
    """Staging row of the ``tempbinpkg`` table used while provisioning the DB.

    Each row pairs a binary package (name, version) with one file digest and
    the architecture of that file.
    """

    __tablename__ = "tempbinpkg"
    # Emitted as a prefix of CREATE TABLE (i.e. CREATE UNLOGGED TABLE ...).
    # NOTE(review): presumably targets PostgreSQL unlogged tables -- confirm.
    __table_args__ = {"prefixes": ["UNLOGGED"]}

    # Surrogate integer primary key.
    binpkg_id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)
    version = Column(String, nullable=False)
    file_sha256 = Column(String, nullable=False)
    architecture = Column(String, nullable=False)

    def __repr__(self):
        """Return a short debugging representation ``name-version``."""
        return f"<TempBinary {self.name}-{self.version}>"
9 changes: 6 additions & 3 deletions lib/downloads.py
Expand Up @@ -107,7 +107,7 @@ def download_with_retry(url, path, sha256=None):
# os.remove(tmp_path)
raise Exception(f"{os.path.basename(url)}: wrong SHA256: {tmp_sha256} != {sha256}")
os.rename(tmp_path, path)
return path
return sha256


@retry(
Expand Down Expand Up @@ -151,13 +151,16 @@ def download_with_retry_and_resume(url, path, timeout=30, sha256=None, no_clean=
os.remove(tmp_path)
raise Exception(f"{fname}: wrong SHA256: {tmp_sha256} (expected: {sha256})")
os.rename(tmp_path, path)
sha256 = tmp_sha256
elif file_size == -1:
raise Exception(f"{f}: failed to get 'Content-Length': {url}")

return sha256


def download_with_retry_and_resume_threshold(url, path, size=None, sha256=None, no_clean=False):
    """Download ``url`` to ``path``, choosing the strategy from the file size.

    Files whose known ``size`` is at most ``MAX_DIRECT_DOWNLOAD_SIZE * 1000 * 1000``
    are fetched with ``download_with_retry``. Larger files, and files whose
    size is unknown, are fetched with ``download_with_retry_and_resume``.

    Each branch performs exactly one download: the helper is called once and
    its result returned, instead of being called a first time with the result
    discarded and then called again.

    :param url: location of the file to fetch.
    :param path: destination path on disk.
    :param size: expected size (anything ``int()`` accepts), or ``None`` when
        unknown. NOTE(review): presumably bytes, with the threshold constant
        expressed in megabytes -- confirm.
    :param sha256: expected digest forwarded to the selected helper, if any.
    :param no_clean: forwarded to ``download_with_retry_and_resume`` only.
    :return: the value returned by the selected download helper.
    """
    # For file less than MAX_DIRECT_DOWNLOAD_SIZE we do a direct download
    if size is not None and int(size) <= MAX_DIRECT_DOWNLOAD_SIZE * 1000 * 1000:
        return download_with_retry(url, path, sha256=sha256)
    return download_with_retry_and_resume(
        url, path, sha256=sha256, no_clean=no_clean, file_size=size
    )

0 comments on commit 572309a

Please sign in to comment.