diff --git a/README.md b/README.md
index 0b32785..c90dd53 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ It is a package that supports common utils used by FOSSLight Scanner.
 3. It provides a simple function to create a text file.
 4. It defines common constant variables.
 5. It provides a thread that prints the spinner.
+6. It downloads the source code.
 
 [or]: http://collab.lge.com/main/x/xDHlFg
 
@@ -110,6 +111,27 @@ timer.setDaemon(True)
 timer.start()
 ```
 
+### 6. Download the source code (tests/test_download.py)
+If you give a link, the source code is downloaded to the target directory via git clone or wget.
+
+#### How it works
+1. Try git clone.
+2. If git clone fails, download the source with wget and extract the compressed file.
+3. After extraction, delete the compressed file.
+
+#### Parameters
+| Parameter | Argument | Description |
+| ------------- | ------------- | ------------- |
+| -h | None | Print the help message. |
+| -s | String | Link to download. |
+| -t | String | Path to download and extract into. |
+| -d | String | Path to save the log file. |
+
+#### How to run
+```
+$ fosslight_download -s "https://github.com/LGE-OSS/example" -t target_dir/
+```
+
 ## 👏 How to report issue
 
 Please report any ideas or bugs to improve by creating an issue in [fosslight_util repository][cl]. Then there will be quick bug fixes and upgrades. Ideas to improve are always welcome.
diff --git a/requirements.txt b/requirements.txt
index abbd040..de3cab8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,6 @@ progress
 PyYAML
 lastversion
 coloredlogs
+pygit2
+python3-wget
+beautifulsoup4
diff --git a/setup.py b/setup.py
index d1e9bba..9a4a7eb 100644
--- a/setup.py
+++ b/setup.py
@@ -30,5 +30,10 @@         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
     ],
-    install_requires=required
+    install_requires=required,
+    entry_points={
+        "console_scripts": [
+            "fosslight_download = fosslight_util.download:main",
+        ]
+    }
 )
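The README section above documents the `fosslight_download` command that the new `console_scripts` entry point installs. The same flow is also callable from Python through `cli_download_and_extract` (defined in `src/fosslight_util/download.py` below); a minimal sketch, where the directory names are only illustrative:

```
from fosslight_util.download import cli_download_and_extract

# Clone https://github.com/LGE-OSS/example (or fall back to wget + extraction)
# into target_dir/ and write the fosslight_download_*.txt log under download_log/.
# Both directory names are illustrative.
success, msg = cli_download_and_extract(
    "https://github.com/LGE-OSS/example",  # -s : link to download
    "target_dir",                          # -t : path to download and extract into
    "download_log")                        # -d : path to save the log file
print(success, msg)
```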
diff --git a/src/fosslight_util/_get_downloadable_url.py b/src/fosslight_util/_get_downloadable_url.py
new file mode 100755
index 0000000..760a91e
--- /dev/null
+++ b/src/fosslight_util/_get_downloadable_url.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2020 LG Electronics Inc.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+import re
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import fosslight_util.constant as constant
+
+logger = logging.getLogger(constant.LOGGER_NAME)
+
+
+def get_downloadable_url(link):
+
+    ret = False
+    new_link = ''
+
+    link = link.replace('http://', '')
+    link = link.replace('https://', '')
+
+    if link.startswith('pypi.org/'):
+        ret, new_link = get_download_location_for_pypi(link)
+    elif link.startswith('mvnrepository.com/artifact/') or link.startswith('repo1.maven.org/'):
+        ret, new_link = get_download_location_for_maven(link)
+    elif link.startswith('www.npmjs.com/') or link.startswith('registry.npmjs.org'):
+        ret, new_link = get_download_location_for_npm(link)
+    elif link.startswith('pub.dev/'):
+        ret, new_link = get_download_location_for_pub(link)
+
+    return ret, new_link
+
+
+def get_download_location_for_pypi(link):
+    # get the url for downloading source file in pypi.org/project/(oss_name)/(oss_version)/#files
+    ret = False
+    new_link = ''
+
+    try:
+        dn_loc_re = re.findall(r'pypi.org\/project\/?([^\/]*)\/?([^\/]*)', link)
+        oss_name = dn_loc_re[0][0]
+        oss_version = dn_loc_re[0][1]
+
+        pypi_url = 'https://pypi.org/project/' + oss_name + '/' + oss_version + '/#files'
+
+        content = urlopen(pypi_url).read().decode('utf8')
+        bs_obj = BeautifulSoup(content, 'html.parser')
+
+        tr_list = bs_obj.find('div', {'id': 'files'}).findAll('tr')
+        for i in tr_list:
+            td = i.findAll('td')
+            for td_i in td:
+                str_i = str(td_i).replace('\n', ' ')
+                if re.findall(r'File type[\s]*(Source)[\s]*', str_i):
+                    new_link = i.find('a').attrs['href']
+                    ret = True
+                    break
+    except Exception as error:
+        ret = False
+        logger.warning('Cannot find the link for pypi (url:'+link+') '+str(error))
+
+    return ret, new_link
+
+
+def get_download_location_for_maven(link):
+    # get the url for downloading source file in
+    # repo1.maven.org/maven2/(group_id(split to separator '/'))/(artifact_id)/(oss_version)
+    ret = False
+    new_link = ''
+
+    try:
+        if link.startswith('mvnrepository.com/artifact/'):
+            dn_loc_split = link.replace('mvnrepository.com/', '').split('/')
+            group_id = dn_loc_split[1].replace('.', '/')
+            dn_loc = 'https://repo1.maven.org/maven2/' + group_id + '/' + dn_loc_split[2] + '/' + dn_loc_split[3]
+
+        elif link.startswith('repo1.maven.org/maven2/'):
+            dn_loc_split = link.replace('repo1.maven.org/maven2/', '').split('/')
+
+            if link.endswith('.tar.gz') or link.endswith('.jar') or link.endswith('.tar.xz'):
+                new_link = 'https://' + link
+                ret = True
+                return ret, new_link
+            else:
+                dn_loc = 'https://' + link
+        else:
+            raise Exception("not valid url for maven")
+
+        html = urlopen(dn_loc).read().decode('utf8')
+        bs_obj = BeautifulSoup(html, 'html.parser')
+
+        file_name = dn_loc.split('/')[-2] + '-' + dn_loc.split('/')[-1] + '-sources.jar'
+
+        for link in bs_obj.findAll("a"):
+            if link.text == file_name:
+                source_url = link['href']
+                new_link = dn_loc + '/' + source_url
+                break
+            elif link['href'].endswith('sources.jar') or link['href'].endswith('source.jar') or link['href'].endswith('src.jar'):
+                source_url = link['href']
+                new_link = dn_loc + '/' + source_url
+
+        if new_link != '':
+            ret = True
+
+    except Exception as error:
+        ret = False
+        logger.warning('Cannot find the link for maven (url:'+link+') '+str(error))
+
+    return ret, new_link
+
+
+def get_download_location_for_npm(link):
+    # url format : registry.npmjs.org/packagename/-/packagename-version.tgz
+    ret = False
+    new_link = ''
+
+    try:
+        if link.startswith('www.npmjs.com/') or link.startswith('registry.npmjs.org'):
+            dn_loc_split = link.split('/')
+
+            if dn_loc_split[1] == 'package':
+                idx = 2
+            else:
+                idx = 1
+
+            if dn_loc_split[idx].startswith('@'):
+                oss_name_npm = dn_loc_split[idx]+'/'+dn_loc_split[idx+1]
+                tar_name = dn_loc_split[idx+1] + '-' + dn_loc_split[idx+3]
+            else:
+                oss_name_npm = dn_loc_split[idx]
+                tar_name = oss_name_npm + '-' + dn_loc_split[idx+2]
+
+            new_link = 'https://registry.npmjs.org/' + oss_name_npm + '/-/' + tar_name + '.tgz'
+            ret = True
+
+    except Exception as error:
+        ret = False
+        logger.warning('Cannot find the link for npm (url:'+link+') '+str(error))
+
+    return ret, new_link
+
+
+def get_download_location_for_pub(link):
+    ret = False
+    new_link = ''
+
+    # url format : https://pub.dev/packages/(oss_name)/versions/(oss_version)
+    # download url format : https://storage.googleapis.com/pub-packages/packages/(oss_name)-(oss_version).tar.gz
+    try:
+        if link.startswith('pub.dev/packages'):
+            dn_loc_split = link.split('/')
+            oss_name_pub = dn_loc_split[2]
+            oss_version_pub = dn_loc_split[4]
+
+            new_link = 'https://storage.googleapis.com/pub-packages/packages/' + oss_name_pub + '-' + oss_version_pub + '.tar.gz'
+            ret = True
+
+    except Exception as error:
+        ret = False
+        logger.warning('Cannot find the link for pub (url:'+link+') '+str(error))
+
+    return ret, new_link
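Before falling back to wget, `download.py` (below) calls `get_downloadable_url()` to rewrite a package-page link into a direct archive URL. A minimal sketch of that rewriting for a pub.dev link, which is a pure string transformation in `get_download_location_for_pub`; the pypi and maven branches additionally fetch and parse the page, so they need network access:

```
from fosslight_util._get_downloadable_url import get_downloadable_url

# pub.dev page -> storage.googleapis.com/pub-packages/packages/<name>-<version>.tar.gz
ok, url = get_downloadable_url("https://pub.dev/packages/file/versions/5.2.1")
print(ok)   # True
print(url)  # https://storage.googleapis.com/pub-packages/packages/file-5.2.1.tar.gz
```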
diff --git a/src/fosslight_util/download.py b/src/fosslight_util/download.py
new file mode 100755
index 0000000..11fc09a
--- /dev/null
+++ b/src/fosslight_util/download.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2020 LG Electronics Inc.
+# SPDX-License-Identifier: Apache-2.0
+import os
+import sys
+import wget
+import tarfile
+import zipfile
+import logging
+import getopt
+import shutil
+import pygit2 as git
+import bz2
+from datetime import datetime
+from pathlib import Path
+from ._get_downloadable_url import get_downloadable_url
+import fosslight_util.constant as constant
+from fosslight_util.set_log import init_log
+import signal
+
+logger = logging.getLogger(constant.LOGGER_NAME)
+compression_extension = {".tar.bz2", ".tar.gz", ".tar.xz", ".tgz", ".tar", ".zip", ".jar", ".bz2"}
+SIGNAL_TIMEOUT = 600
+
+
+class TimeOutException(Exception):
+    pass
+
+
+def alarm_handler(signum, frame):
+    logger.warning("git clone timeout! (%d sec)", SIGNAL_TIMEOUT)
+    raise TimeOutException()
+
+
+def print_help_msg():
+    print("* Required : -s link_to_download")
+    print("* Optional : -t target_directory")
+    print("* Optional : -d log_file_directory")
+    sys.exit()
+
+
+def main():
+
+    src_link = ""
+    target_dir = os.getcwd()
+    log_dir = os.getcwd()
+
+    try:
+        argv = sys.argv[1:]
+        opts, args = getopt.getopt(argv, 'hs:t:d:')
+    except getopt.GetoptError:
+        print_help_msg()
+
+    for opt, arg in opts:
+        if opt == "-h":
+            print_help_msg()
+        elif opt == "-s":
+            src_link = arg
+        elif opt == "-t":
+            target_dir = arg
+        elif opt == "-d":
+            log_dir = arg
+
+    if src_link == "":
+        print_help_msg()
+    else:
+        cli_download_and_extract(src_link, target_dir, log_dir)
+
+
+def cli_download_and_extract(link, target_dir, log_dir, checkout_to="", compressed_only=False):
+    global logger
+
+    success = True
+    msg = ""
+    log_file_name = "fosslight_download_" + \
+        datetime.now().strftime('%Y%m%d_%H-%M-%S')+".txt"
+    logger, log_item = init_log(os.path.join(log_dir, log_file_name))
+
+    try:
+        if link == "":
+            success = False
+            msg = "Need a link to download."
+        elif os.path.isfile(target_dir):
+            success = False
+            msg = "The target directory exists as a file: "+target_dir
+        else:
+            if not download_git_clone(link, target_dir, checkout_to):
+                if os.path.isdir(target_dir):
+                    shutil.rmtree(target_dir)
+
+                success, downloaded_file = download_wget(link, target_dir, compressed_only)
+                if success:
+                    success = extract_compressed_file(downloaded_file, target_dir, True)
+    except Exception as error:
+        success = False
+        msg = str(error)
+
+    logger.info("* FOSSLight Downloader - Result :"+str(success)+"\n"+msg)
+    return success, msg
+
+
+def get_ref_to_checkout(checkout_to, ref_list):
+    ref_to_checkout = checkout_to
+    try:
+        checkout_to = checkout_to.strip()
+        if checkout_to in ref_list:
+            return checkout_to
+
+        prefix_refs = ["refs/remotes/origin/", "refs/tags/"]
+        for prefix in prefix_refs:
+            ref_to_checkout = prefix+checkout_to
+            if ref_to_checkout in ref_list:
+                return ref_to_checkout
+
+        ref_to_checkout = next(
+            x for x in ref_list if x.endswith(checkout_to))
+    except Exception as error:
+        logger.warning("git find ref - failed:"+str(error))
+    return ref_to_checkout
+
+
+def download_git_clone(git_url, target_dir, checkout_to=""):
+    signal.signal(signal.SIGALRM, alarm_handler)
+    signal.alarm(SIGNAL_TIMEOUT)
+    try:
+        Path(target_dir).mkdir(parents=True, exist_ok=True)
+        repo = git.clone_repository(git_url, target_dir,
+                                    bare=False, repository=None,
+                                    remote=None, callbacks=None)
+        signal.alarm(0)
+    except Exception as error:
+        logger.warning("git clone - failed:"+str(error))
+        return False
+    try:
+        ref_to_checkout = checkout_to
+        if checkout_to != "":
+            ref_list = [x for x in repo.references]
+            ref_to_checkout = get_ref_to_checkout(checkout_to, ref_list)
+            logger.info("git checkout :"+ref_to_checkout)
+            repo.checkout(ref_to_checkout)
+    except Exception as error:
+        logger.warning("git checkout to "+ref_to_checkout +
+                       " - failed:"+str(error))
+    return True
+
+
+def download_wget(link, target_dir, compressed_only):
+    success = False
+    downloaded_file = ""
+
+    signal.signal(signal.SIGALRM, alarm_handler)
+    signal.alarm(SIGNAL_TIMEOUT)
+    try:
+        Path(target_dir).mkdir(parents=True, exist_ok=True)
+
+        ret, new_link = get_downloadable_url(link)
+        if ret and new_link != "":
+            link = new_link
+
+        if compressed_only:
+            for ext in compression_extension:
+                if link.endswith(ext):
+                    success = True
+                    break
+        else:
+            success = True
+
+        if not success:
+            raise Exception('Not supported compression type (link:{0})'.format(link))
+
+        logger.info("wget:"+link)
+        downloaded_file = wget.download(link)
+        signal.alarm(0)
+
+        shutil.move(downloaded_file, target_dir)
+        downloaded_file = os.path.join(target_dir, downloaded_file)
+        if downloaded_file != "":
+            success = True
+            logger.debug("wget - downloaded:"+downloaded_file)
+    except Exception as error:
+        success = False
+        logger.warning("wget - failed:"+str(error))
+
+    return success, downloaded_file
+
+
+def extract_compressed_dir(src_dir, target_dir, remove_after_extract=True):
+    logger.debug("Extract Dir:"+src_dir)
+    try:
+        files_path = [os.path.join(src_dir, x) for x in os.listdir(src_dir)]
+        for fname in files_path:
+            extract_compressed_file(fname, target_dir, remove_after_extract)
+    except Exception as error:
+        logger.debug("Extract files in dir - failed:"+str(error))
+        return False
+    return True
+
+
+def extract_compressed_file(fname, extract_path, remove_after_extract=True):
+    try:
+        is_compressed_file = True
+        if os.path.isfile(fname):
+            if fname.endswith(".tar.bz2"):
+                decompress_bz2(fname, extract_path)
+                os.remove(fname)
+                fname = os.path.splitext(fname)[0]
+
+            if fname.endswith(".tar.gz") or fname.endswith(".tgz"):
+                tar = tarfile.open(fname, "r:gz")
+                tar.extractall(path=extract_path)
+                tar.close()
+            elif fname.endswith(".tar.xz") or fname.endswith(".tar"):
+                tar = tarfile.open(fname, "r:*")
+                tar.extractall(path=extract_path)
+                tar.close()
+            elif fname.endswith(".zip") or fname.endswith(".jar"):
+                unzip(fname, extract_path)
+            elif fname.endswith(".bz2"):
+                decompress_bz2(fname, extract_path)
+            else:
+                is_compressed_file = False
+                logger.warning("Unsupported file extension:"+fname)
+
+            if remove_after_extract and is_compressed_file:
+                logger.debug("Remove - extracted file :"+fname)
+                os.remove(fname)
+        else:
+            logger.warning("Not a file:"+fname)
+    except Exception as error:
+        logger.error("Extract - failed:"+str(error))
+        return False
+    return True
+
+
+def decompress_bz2(source_file, dest_path):
+    try:
+        fzip = bz2.BZ2File(source_file)
+        data = fzip.read()  # get the decompressed data
+        open(os.path.splitext(source_file)[0], 'wb').write(data)  # write an uncompressed file
+
+    except Exception as error:
+        logger.error("Decompress bz2 - failed:"+str(error))
+        return False
+    return True
+
+
+def unzip(source_file, dest_path):
+    try:
+        fzip = zipfile.ZipFile(source_file, 'r')
+        for filename in fzip.namelist():
+            fzip.extract(filename, dest_path)
+        fzip.close()
+    except Exception as error:
+        logger.error("Unzip - failed:"+str(error))
+        return False
+    return True
+
+
+if __name__ == '__main__':
+    main()
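When `checkout_to` is passed, `download_git_clone` resolves it against the repository's reference list with `get_ref_to_checkout`: an exact match wins, then the `refs/remotes/origin/` and `refs/tags/` prefixes are tried, and finally the first reference that ends with the requested name. A small offline sketch with a made-up reference list:

```
from fosslight_util.download import get_ref_to_checkout

# Hypothetical reference list, in the form pygit2's repo.references yields.
refs = ["refs/remotes/origin/main",
        "refs/remotes/origin/develop",
        "refs/tags/v1.1.0"]

print(get_ref_to_checkout("develop", refs))  # refs/remotes/origin/develop
print(get_ref_to_checkout("v1.1.0", refs))   # refs/tags/v1.1.0
```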
diff --git a/tests/test_download.py b/tests/test_download.py
new file mode 100755
index 0000000..9fe3c59
--- /dev/null
+++ b/tests/test_download.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2021 LG Electronics Inc.
+# SPDX-License-Identifier: Apache-2.0
+from fosslight_util.download import cli_download_and_extract
+
+
+def main():
+    cli_download_and_extract("https://github.com/LGE-OSS/example", "test_result/download", "test_result/download_log")
+
+
+if __name__ == '__main__':
+    main()
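The test above drives the full download-and-extract flow over the network. The extraction helpers can also be used on their own for an archive that is already on disk; a sketch, where the archive path is hypothetical:

```
from fosslight_util.download import extract_compressed_file

# Unpack an already-downloaded archive (hypothetical path) into extracted/
# and keep the archive instead of deleting it after extraction.
extract_compressed_file("downloads/example-1.0.tar.gz", "extracted",
                        remove_after_extract=False)
```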
diff --git a/tox.ini b/tox.ini
index f0d9a8d..9f6c807 100644
--- a/tox.ini
+++ b/tox.ini
@@ -18,6 +18,7 @@ exclude = .tox/*
 
 [pytest]
 filterwarnings = ignore::DeprecationWarning
+norecursedirs = test_result/*
 
 [testenv:test_run]
 commands =
@@ -34,6 +35,14 @@ commands =
     cat test_result/excel/FOSSLight-Report_SRC.csv
    cat test_result/excel/FOSSLight-Report_BIN_TEST.csv
    cat test_result/excel/FOSSLight-Report_CUSTOM_HEADER_SHEET.csv
+    # Test - downloading source
+    fosslight_download -s "https://github.com/LGE-OSS/example" -t test_git/
+    ls test_git/
+    fosslight_download -s "https://pypi.org/project/fosslight-dependency/3.0.5/" -t test_wget -d test_logs
+    fosslight_download -s "https://www.npmjs.com/package/json-schema/v/0.3.0" -t test_wget -d test_logs
+    fosslight_download -s "https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind/2.12.2" -t test_wget -d test_logs
+    fosslight_download -s "https://pub.dev/packages/file/versions/5.2.1" -t test_wget -d test_logs
+    ls test_wget/
 
 [testenv:release]
 deps =
@@ -57,5 +66,8 @@ commands =
     cat test_result/excel/FOSSLight-Report_CUSTOM_HEADER_SHEET.csv
    # Test - timer
    python tests/test_timer.py
+    # Test - downloading source
+    python tests/test_download.py
+    ls test_result/download
    # Test - check PEP8
    pytest -v --flake8
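The tox runs above always stay on the cloned repository's default branch. Checking out a specific tag or branch goes through the optional `checkout_to` argument of `cli_download_and_extract`; a sketch, where the tag name is only illustrative:

```
from fosslight_util.download import cli_download_and_extract

# "v1.0.0" is an illustrative tag name; get_ref_to_checkout resolves it to
# refs/tags/v1.0.0 (or a matching remote branch) before the checkout.
cli_download_and_extract("https://github.com/LGE-OSS/example",
                         "test_result/download_tag",
                         "test_result/download_log",
                         checkout_to="v1.0.0")
```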