In [1]:
import os
from time import sleep
from bs4 import BeautifulSoup
from requests import get
from model.model import ApplicationInfo, PackageSummary
from typing import List
import random
import csv
def save(content, path):
    """Write ``content`` to ``path`` in binary mode, creating parent folders.

    Args:
        content: Bytes-like payload (the file is opened with mode ``'wb'``).
        path: Destination file path; may be a bare file name or include
            directories that do not exist yet.
    """
    print("Saving", f"/{path}")
    dirname = os.path.dirname(path)
    # A bare file name has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create folders when there is one to create.
    if dirname:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'wb') as file:
        file.write(content)

# Download and save: shared configuration for the scraping helpers.
# Seed handed to random.Random when computing the wait jitter.
RANDOM_STATE = 15
# Base URL; request paths are appended to it without a separator.
root_page = "https://f-droid.org"

In [2]:
def page_has_next(soup: BeautifulSoup):
    """Tell whether a parsed listing page links to a following page.

    Args:
        soup: Parsed listing page.

    Returns:
        False when the page has no ``ul.browse-navigation`` element or when
        that element contains ``li.next.disabled``; True otherwise.
    """
    browse_navigation = soup.find("ul", {"class": "browse-navigation"})
    if browse_navigation is None:
        # find() returns None when nothing matches; a page without a pager
        # cannot have a next page, so stop instead of raising AttributeError.
        return False
    return browse_navigation.select_one("li.next.disabled") is None

In [3]:
def save_categories():
    """Download every category listing page linked from the packages index.

    Fetches ``{root_page}/en/packages/``, collects the anchors whose href
    contains ``'categories'`` and, for each of them, calls
    ``hit_and_save_category`` page after page until ``page_has_next`` reports
    that there is no following page.

    Raises:
        requests.HTTPError: if the packages index answers with an error status.
    """
    resp = get(f"{root_page}/en/packages/", timeout=5)
    # Fail loudly instead of silently finding no categories in an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, features='lxml')
    # href=True skips anchors without an href attribute; for those,
    # link.get('href') is None and the 'in' test below would raise TypeError.
    all_links = [link.get('href') for link in soup.find_all('a', href=True)]
    # filter by category; dict.fromkeys drops repeated links while keeping
    # their first-seen order, so each category is crawled only once.
    categories = list(dict.fromkeys(link for link in all_links if 'categories' in link))
    print(categories)
    for cat in categories:
        soup = hit_and_save_category(cat)
        current_page = 1
        while page_has_next(soup):
            current_page += 1
            soup = hit_and_save_category(cat, current_page)

def wait_and_jitter(seconds):
    """Sleep for ``seconds`` plus a random jitter of up to half that amount.

    The generator is seeded with ``RANDOM_STATE`` once, on the first call, and
    then reused. Re-creating it on every call (as before) restarted the same
    sequence each time, so the "jitter" was one constant value.

    Args:
        seconds: Base waiting time in seconds.
    """
    rng = getattr(wait_and_jitter, "_rng", None)
    if rng is None:
        # Lazily create the shared generator and keep it on the function so
        # consecutive calls draw consecutive values from the seeded sequence.
        rng = random.Random(RANDOM_STATE)
        wait_and_jitter._rng = rng
    # rng.random() is in [0, 1), so the jitter is in [0, seconds / 2).
    jitter = seconds * 0.5 * rng.random()
    sleep(seconds + jitter)

def hit_and_save_category(cat, page=1):
    """Fetch one page of a category listing, save it to disk and return it parsed.

    Args:
        cat: Category href taken from the packages index; the category name is
            its second-to-last ``/``-separated segment.
        page: 1-based page number. Page 1 is fetched from ``root_page + cat``;
            later pages from ``{root_page}/en/categories/{name}/{page}``.

    Returns:
        The BeautifulSoup document of the fetched page. Its prettified HTML is
        also written to ``categories/{name}/{name}.html`` (page 1) or
        ``categories/{name}/{name}_{page}.html``.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never stored as if it were a category listing.
    """
    wait_and_jitter(2)
    category_name = cat.split("/")[-2]
    file_name = category_name
    if page > 1:
        page_to_hit = f"{root_page}/en/categories/{category_name}/{page}"
        file_name += f"_{page}"
    else:
        page_to_hit = f"{root_page}{cat}"
    print(f"Hitting {page_to_hit}")
    resp = get(page_to_hit, timeout=10)
    # Same policy as get_complete_info: a non-OK answer aborts the crawl.
    resp.raise_for_status()
    soup = BeautifulSoup(markup=resp.text, features='lxml')
    save(soup.prettify(encoding='utf-8'), f"categories/{category_name}/{file_name}.html")
    return soup

In [4]:
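# Uncomment to download and save every category listing page
# (slow: waits about 2 seconds before each request).
# save_categories()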

In [5]:
def get_applications_from_category_list(html: BeautifulSoup) -> List[dict]:
    """Collect the application entries listed on a parsed category page.

    Looks inside ``div#package-list`` for ``a.package-header`` anchors and
    turns each one into a dict with the keys ``name`` (text of its ``h4``),
    ``summary`` (text of its first ``span``), ``link`` (``root_page`` + href)
    and ``package`` (second-to-last ``/`` segment of the link).
    """
    container = html.find("div", {"id": "package-list"})
    headers = container.find_all("a", {"class": "package-header"})

    def _describe(entry):
        # Same extraction order as the fields appear in the result dict's source.
        title = entry.find("h4").get_text().strip()
        url = f"{root_page}{entry.get('href').strip()}"
        blurb = entry.find("span").get_text().strip()
        return {
            "name": title,
            "summary": blurb,
            "link": url,
            "package": url.split("/")[-2],
        }

    # Plain strings are skipped; only tag entries are described.
    return [_describe(entry) for entry in headers if not isinstance(entry, str)]

def get_package_info(html: BeautifulSoup):
    """Extract author, license and project links from a list of ``li`` items.

    Every ``li`` under ``html`` is classified by its text:

    * text containing "author": the last non-empty line is the author name and
      the first anchor's href (if any) is the author contact;
    * text containing "license": the last non-empty line is the license name
      and the first anchor's href (if any) is the license page;
    * otherwise the first anchor's text selects one of website, issue tracker,
      source code or build metadata, and its href is stored.

    Items without an anchor no longer raise AttributeError: the href is left
    as None (author / license) or the item is ignored (other links).

    Returns:
        dict with the keys ``author``, ``author_contact``, ``license``,
        ``license_page``, ``website``, ``issue_tracker``, ``source_code`` and
        ``build_metadata`` (in that order); missing values are None.
    """
    # Insertion order matters: callers use these keys as CSV column order.
    info = {
        "author": None,
        "author_contact": None,
        "license": None,
        "license_page": None,
        "website": None,
        "issue_tracker": None,
        "source_code": None,
        "build_metadata": None,
    }
    # (text looked for in the anchor, key it fills), checked in this order.
    link_labels = (
        ("website", "website"),
        ("issue tracker", "issue_tracker"),
        ("source code", "source_code"),
        ("build metadata", "build_metadata"),
    )

    def _last_line(text):
        # Last non-empty line once tabs are removed.
        return [t for t in text.replace("\t", "").split("\n") if t][-1]

    for li in html.find_all("li"):
        text = li.get_text()
        lowered = text.lower()
        anchor = li.find("a")
        # find() returns None when the item holds plain text only.
        href = anchor.get("href") if anchor is not None else None
        if "author" in lowered:
            info["author_contact"] = href
            info["author"] = _last_line(text)
        elif "license" in lowered:
            info["license"] = _last_line(text)
            info["license_page"] = href
        elif anchor is not None:
            anchor_text = anchor.get_text().lower()
            for label, key in link_labels:
                if label in anchor_text:
                    info[key] = href
                    break
    return info

def get_application_info_from_package_page(html) -> dict:
    """Return the details parsed from the ``ul.package-links`` list of a package page.

    The list element is located in ``html`` and handed to ``get_package_info``,
    whose result is returned unchanged.
    """
    return get_package_info(html.find("ul", {"class": "package-links"}))

In [6]:
def extract_apps_and_save():
    """Parse every saved category page and write the app list as a CSV file.

    Walks the ``categories`` folder, parses each ``.html`` file with
    ``get_applications_from_category_list`` and writes one row per application
    (``name``, ``package``, ``summary``, ``category``, ``link``) to
    ``output/fdroid_apps_list.csv``. The category is the name of the folder
    that contains the HTML file.

    Raises:
        Exception: whatever the parser raised for a file, re-raised unchanged
            after the offending file name has been printed.
    """
    packages: List[PackageSummary] = []
    for dirpath, _subdirs, filenames in os.walk("categories"):
        htmls_filenames = [f for f in filenames if f.endswith(".html")]
        if not htmls_filenames:
            continue
        # basename works with whichever separator os.walk uses on this platform.
        category = os.path.basename(dirpath)
        print("extracting", category)
        for filename in htmls_filenames:
            file_to_open = os.path.join(dirpath, filename)
            # The pages were saved as UTF-8 bytes, so decode them as UTF-8
            # rather than with the locale's default encoding.
            with open(file_to_open, "r", encoding="utf-8") as html_file:
                soup = BeautifulSoup(html_file.read(), features="lxml")
            try:
                applications = get_applications_from_category_list(soup)
            except Exception:
                print("Error on file: ", file_to_open)
                # Bare raise keeps the original error and traceback
                # (raising a str is itself a TypeError in Python 3).
                raise
            packages.extend(
                PackageSummary(
                    app["name"], app["package"], app["summary"], category, app["link"]
                )
                for app in applications
            )
    # save csv
    print("Saving CSV...")
    path = "output/fdroid_apps_list.csv"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # newline="" is what the csv module asks for, so no extra blank lines
    # appear on platforms that translate line endings.
    with open(path, "w", newline="") as csv_file:
        fields = ["name", "package", "summary", "category", "link"]
        writer = csv.DictWriter(csv_file, fieldnames=fields)
        writer.writeheader()
        for package in packages:
            writer.writerow(package.to_dict())


In [7]:
def _append_rows_to_csv(path, rows):
    """Append dict ``rows`` to the CSV at ``path``; write the header only if the file is new or empty."""
    if not rows:
        return
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, "a", newline="") as out_file:
        writer = csv.DictWriter(out_file, fieldnames=list(rows[0].keys()))
        if needs_header:
            writer.writeheader()
        writer.writerows(rows)


def get_complete_info(start_after: str = ""):
    """Fetch every app's package page and append the merged details to a CSV.

    Reads ``output/fdroid_apps_list.csv``, requests each row's ``link``, merges
    the row with the result of ``get_application_info_from_package_page`` and
    appends the merged rows to ``output/fdroid_apps_list_complete.csv`` in
    batches of 10. Rows still buffered when the loop ends, or when an error
    interrupts it, are written too, so no fetched row is lost.

    Args:
        start_after: Package name to resume after. Rows up to and including
            the one whose ``package`` equals this value are skipped. An empty
            string processes every row.

    Raises:
        Exception: ``"NOT OK"`` when a package page answers with a non-OK status.
    """
    output_path = "output/fdroid_apps_list_complete.csv"
    buffer = []
    with open("output/fdroid_apps_list.csv", newline="") as apps_file:
        apps = csv.DictReader(apps_file)
        should_skip = bool(start_after)
        try:
            for app in apps:
                # skip all apps until start_after (inclusive)
                if should_skip:
                    print("skipping", app["package"])
                    if app["package"] == start_after:
                        should_skip = False
                    continue
                # hit page
                print("Hitting package", app["package"], f"({app['link']})")
                resp = get(app["link"], timeout=5)
                if not resp.ok:
                    print("Response NOT OK for", app["link"])
                    raise Exception("NOT OK")
                soup = BeautifulSoup(resp.text, features="lxml")
                info = get_application_info_from_package_page(soup)
                buffer.append(app | info)
                if len(buffer) >= 10:
                    print("Saving CSV...")
                    _append_rows_to_csv(output_path, buffer)
                    buffer.clear()
                wait_and_jitter(1)
        finally:
            # Flush the last partial batch; also runs when a request fails so
            # the rows fetched so far survive and the crawl can be resumed.
            if buffer:
                print("Saving CSV...")
                _append_rows_to_csv(output_path, buffer)
                buffer.clear()
    if should_skip:
        # The marker was never seen, so every row was skipped.
        print(f"start_after package {start_after!r} was not found; nothing was fetched")
    print("Done")


In [8]:
# Resume the detail crawl: rows of output/fdroid_apps_list.csv are skipped up to
# and including this package, and fetching continues with the row after it.
get_complete_info(start_after="eu.veldsoft.tri.peaks")

skipping net.bitconomy.ckpoolwatcher
skipping com.mattallen.loaned
skipping com.repay.android
skipping org.moparisthebest.pageplus
skipping com.veken0m.bitcoinium
skipping wb.receiptspro
skipping ee.smkv.calc.loan
skipping org.tryton.client
skipping com.nanoconverter.zlab
skipping org.billthefarmer.specie
skipping org.billthefarmer.currency
skipping hashengineering.groestlcoin.wallet_test
skipping hashengineering.groestlcoin.wallet
skipping com.tombursch.kitchenowl
skipping net.stargw.fx
skipping com.agoradesk.app
skipping co.localmonero.app
skipping com.igisw.openmoneybox
skipping com.coinerella.peercoin
skipping me.hackerchick.catima
skipping io.horizontalsystems.bankwallet
skipping org.encointer.wallet
skipping org.totschnig.myexpenses
skipping com.invoiceninja.app
skipping com.cosmos.candle
skipping net.taler.wallet.fdroid
skipping com.btcontract.wallet
skipping com.starry.greenstash
skipping de.salomax.currencies
skipping de.chaosdorf.meteroid
skipping ua.com.radiokot.lnaddr2invoi

In [29]:
# Parse a locally saved package page and display the fields extracted from it.
with open("fdroid.html", 'r') as html_file:
    # The document is fully parsed here, so the file is only needed for this line.
    soup = BeautifulSoup(html_file, features='lxml')
example = get_application_info_from_package_page(soup)
example

{'author': 'SECUSO - Security Usability Society',
 'author_contact': 'mailto:contact@secuso.org?subject=F-Droid - Free and Open Source Android App Repository%20on%20',
 'license': 'GNU General Public License v3.0 or later',
 'license_page': 'https://www.gnu.org/licenses/gpl-3.0-standalone.html',
 'website': 'https://www.secuso.informatik.tu-darmstadt.de/index.php?id=11404&L=0',
 'issue_tracker': 'https://github.com/SecUSo/privacy-friendly-tape-measure/issues',
 'source_code': 'https://github.com/SecUSo/privacy-friendly-tape-measure',
 'build_metadata': 'https://gitlab.com/fdroid/fdroiddata/tree/master/metadata/org.secuso.privacyfriendlytapemeasure.yml'}