# V. LFwC: A New Corpus to Demonstrate the Practicability of the Proposed Requirements

We created a Linux Firmware Corpus (LFwC) to assess the practicability of our requirements. It is based on data until June 2023 and consists of 10,913 deduplicated and unpacked firmware images from ten known manufacturers. It includes both current and historical firmware samples, covering 2,365
unique devices across 22 classes. To provide an overview of LFwC, we added corpus data points to the bottom of Table II. We share as much data as legally possible and publish all scripts, tools, and virtual machines for replicability. We tear down LFwC’s unpacking barrier with an open-source process
for verified unpacking success.

## Preparations

Below you will find the preparatory steps, such as imports and constant definitions, that the rest of the notebook relies on.

### Imports

In [None]:
import json
from collections import deque
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import rc
from matplotlib.ticker import ScalarFormatter
from packaging.version import Version, parse

### Constants

In [None]:
def _rotated(colors, steps: int) -> list:
    """Return `colors` as a new list, cyclically shifted by `steps` (negative values shift left)."""
    ring = deque(colors)
    ring.rotate(steps)
    return list(ring)


# Seaborn's "colorblind" palette, rotated left by four so that its fifth entry comes first.
# NOTE(review): with as_cmap=True seaborn is expected to hand back this qualitative palette as a
# list of hex color strings -- confirm against the installed seaborn version.
CMAP: list[str] = _rotated(sns.color_palette("colorblind", as_cmap=True), -4)

# Same colors as CMAP, shifted right by one (the last entry of CMAP moves to the front).
CMAP_2: list[str] = _rotated(CMAP, 1)

# CSV file that is loaded into the notebook's corpus DataFrame.
CORPUS_PATH: Path = Path("../public_data/lfwc-full.csv")
# Directory into which the figure-generating cells save their PDFs.
FIGURE_DEST: Path = Path("../figures")

# Manufacturer display names. The plotting cells pass the reversed list (`Y_LABELS[::-1]`)
# as legend labels.
# NOTE(review): matplotlib assigns these labels positionally, so the reversed order presumably
# has to match the order of the pivoted "manufacturer" columns -- verify against the data.
Y_LABELS: list[str] = [
    "Ubiquiti",
    "TRENDnet",
    "NETGEAR",
    "Linksys",
    "EnGenius",
    "EDIMAX",
    "D-Link",
    "ASUS",
    "TP-Link",
    "AVM",
]

### Matplotlib Settings

In [None]:
# Notebook-wide plot defaults: serif "Times" font at size 15, with text typeset via TeX.
rc("font", family="serif", serif=["Times"], size=15)
rc("text", usetex=True)
# Do not truncate long cell contents when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)

### Read Data

In [None]:
# Load the corpus CSV; its first column becomes the DataFrame index.
df: pd.DataFrame = pd.read_csv(CORPUS_PATH, index_col=0)

## Peek Into Raw Data

In [None]:
# A bare expression at the end of a notebook cell is rendered as the cell's output.
df

## Table III - LFwC: Corpus Statistics Overview

In [None]:
def corpus_statistics_overview(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the corpus per manufacturer (Table III).

    Args:
        df: Corpus table. The columns `manufacturer`, `sha256`, `device_name`,
            `compressed_firmware_size` and `files_in_firmware` are read.

    Returns:
        One row per manufacturer (sorted by manufacturer, fresh 0..n-1 index) with the columns
        `Manufact.`, `Samples` (distinct `sha256` values), `Devices` (distinct `device_name`
        values), `Mean Samples per Device` (Samples / Devices, 2 decimals),
        `Mean Size per Sample` (mean `compressed_firmware_size` divided by 1024**2, rounded to
        0 decimals) and `Mean Files per Sample` (mean `files_in_firmware`, 2 decimals).
    """
    # A single pass over the manufacturer groups yields every raw figure the table needs.
    per_vendor = df.groupby(["manufacturer"], as_index=False).agg(
        samples=("sha256", "nunique"),
        devices=("device_name", "nunique"),
        mean_size=("compressed_firmware_size", "mean"),
        mean_files=("files_in_firmware", "mean"),
    )

    size_divisor = 1024**2
    samples_per_device = per_vendor["samples"] / per_vendor["devices"]

    return pd.DataFrame(
        {
            "Manufact.": per_vendor["manufacturer"],
            "Samples": per_vendor["samples"],
            "Devices": per_vendor["devices"],
            "Mean Samples per Device": samples_per_device.round(2),
            "Mean Size per Sample": (per_vendor["mean_size"] / size_divisor).round(0),
            "Mean Files per Sample": per_vendor["mean_files"].round(2),
        }
    )

In [None]:
# Build the per-manufacturer statistics table and show it as the cell output.
df_stats: pd.DataFrame = corpus_statistics_overview(df)
df_stats

## Figure 7 - LFwC firmware distribution per release date. For 747 samples, our scrapers could not extract any release date from the sources.

In [None]:
def create_figure_7_firmware_distribution_per_release_date(df: pd.DataFrame) -> None:
    """Plot Figure 7: stacked bars of distinct samples per release-date bucket and manufacturer.

    The `release_date` strings are shortened by dropping their last two "-"-separated
    components; distinct `md5` values are then counted per (shortened date, manufacturer).
    The chart is saved as `f7_corpus_release_dates.pdf` inside `FIGURE_DEST` and shown.

    Font settings are applied through a matplotlib rc context, so the global rcParams are
    restored even if plotting raises.

    Args:
        df: Corpus table; the columns `release_date`, `manufacturer` and `md5` are used.
    """
    df_release_buckets = df.copy()
    # Dropping the last two "-"-separated components leaves only the leading part of the date
    # (presumably the year of a YYYY-MM-DD string -- confirm the date format in the CSV).
    df_release_buckets["release_date"] = (
        df_release_buckets["release_date"].str.split("-").str[:-2].str.join("-")
    )
    df_history = (
        df_release_buckets.groupby(["release_date", "manufacturer"], as_index=False)
        .nunique()
        .pivot(index="release_date", columns=["manufacturer"], values="md5")
        .fillna(value=0.0)
    )

    # savefig does not create missing directories on its own.
    FIGURE_DEST.mkdir(parents=True, exist_ok=True)

    with plt.rc_context({"font.family": "serif", "font.serif": ["Times"], "font.size": 16}):
        ax = df_history.plot(
            kind="bar",
            grid=True,
            stacked=True,
            logy=False,
            figsize=(8, 6),
            rot=50,
            legend=False,
            edgecolor="black",
            color=CMAP_2,
        )
        # NOTE(review): the tick labels are hard-coded; this assumes the first bucket holds the
        # samples without a usable release date and the remaining ones are 2005..2023 without
        # gaps -- confirm against the data.
        ax.set_xticklabels(["unk."] + [str(i) for i in range(2005, 2024)])
        ax.set_ylabel("Sample Quantity [\\#]")
        ax.set_xlabel("Release Year")
        ax.legend(ncols=4, bbox_to_anchor=(0.9475, 1.275), labels=Y_LABELS[::-1], fontsize=13)
        ax.set_axisbelow(True)
        ax.yaxis.set_major_formatter(ScalarFormatter())
        plt.tight_layout()
        plt.savefig(FIGURE_DEST / "f7_corpus_release_dates.pdf", bbox_inches="tight")
    plt.show()

In [None]:
# Render Figure 7 from the full corpus.
create_figure_7_firmware_distribution_per_release_date(df)

## Figure 8 - Distribution of device classes in LFwC. 

The three most prevalent classes are routers (49%), switches (14%), and access points (12%). We bundled device classes with fewer than 150 samples into the meta class misc. It contains: controller, board, converter, encoder, gateway, kvm, media, nas, phone, power supply, printer, recorder, san, and wifi-usb.

In [None]:
def create_figure_8_distribution_of_device_classes_in_lfwc(df: pd.DataFrame) -> None:
    """Plot Figure 8: grouped, log-scaled bars of distinct samples per device class and manufacturer.

    Device classes matching one of the listed names are relabelled to "misc" first. Distinct
    `md5` values are then counted per (device class, manufacturer). The chart is saved as
    `f8_corpus_classes.pdf` inside `FIGURE_DEST` and shown.

    Font settings are applied through a matplotlib rc context, so the global rcParams are
    restored even if plotting raises.

    Args:
        df: Corpus table; the columns `device_class`, `manufacturer` and `md5` are used.
    """
    df_corpus_misc_classes = df.copy()

    # na=False: a missing device class yields False instead of NaN, so the mask stays purely
    # boolean and can be used for indexing.
    # NOTE(review): this is a regex substring match, so short alternatives such as "nas" or
    # "san" would also hit any other class name containing them -- confirm none exists.
    flt = df_corpus_misc_classes["device_class"].str.contains(
        "controller|board|converter|encoder|gateway|kvm|media|nas|phone|power_supply|printer|recorder|san|wifi-usb",
        na=False,
    )
    df_corpus_misc_classes.loc[flt, "device_class"] = "misc"

    by_classes = (
        df_corpus_misc_classes.groupby(["device_class", "manufacturer"], as_index=False)
        .nunique()
        .pivot(index="device_class", columns=["manufacturer"], values="md5")
        .fillna(value=0.0)
    )

    # savefig does not create missing directories on its own.
    FIGURE_DEST.mkdir(parents=True, exist_ok=True)

    with plt.rc_context({"font.family": "serif", "font.serif": ["Times"], "font.size": 18}):
        ax = by_classes.plot(
            kind="bar",
            grid=True,
            stacked=False,
            logy=True,
            figsize=(20, 4.5),
            color=CMAP_2,
            edgecolor="black",
            legend=False,
            width=0.8,
            rot=0,
        )
        ax.set_axisbelow(True)
        ax.yaxis.set_major_formatter(ScalarFormatter())
        ax.set_ylabel("Sample Quantity [\\#, log]")
        ax.set_xlabel("Device Class")
        # Fixed x-range covering bar groups 0..8.
        ax.set_xlim(-0.41, 8.49)
        ax.legend(ncols=10, bbox_to_anchor=(1.0375, 1.2), labels=Y_LABELS[::-1], fontsize=16)
        # Vertical separators halfway between neighbouring class groups.
        for i in range(0, 11):
            ax.axvline(i + 0.500, color="black", linewidth=1)
        plt.tight_layout()
        plt.savefig(FIGURE_DEST / "f8_corpus_classes.pdf", bbox_inches="tight")
    plt.show()

In [None]:
# Render Figure 8 from the full corpus.
create_figure_8_distribution_of_device_classes_in_lfwc(df)

## Figure 9 - Detected Linux kernel banners in LFwC samples.

In [None]:
def create_figure_9_detected_linux_kernel_banners_in_lfwc_samples(df: pd.DataFrame) -> None:
    """Plot Figure 9: horizontal, log-scaled bars of distinct samples per kernel version bucket.

    The "|"-separated `linux_banners` entries are split into one row per banner, each banner is
    reduced to a `major.minor` version (rows without a banner become version 0.0), and distinct
    `md5` values are counted per (version, manufacturer). Every bar is annotated with its total.
    The chart is saved as `f9_corpus_linux_banners.pdf` inside `FIGURE_DEST` and shown.

    Font settings are applied through a matplotlib rc context, so the global rcParams are
    restored afterwards, also if plotting raises.

    Args:
        df: Corpus table; the columns `linux_banners`, `manufacturer` and `md5` are used.
    """
    df_linux_prep = df.copy()
    # Split only the rows that have banners; the assignment aligns on the index, so all other
    # rows end up with NaN in "linux_banners".
    linux_series = df_linux_prep[df_linux_prep["linux_banners"].notnull()]["linux_banners"].apply(
        lambda x: x.split("|")
    )
    df_linux_prep["linux_banners"] = linux_series
    df_linux_prep = df_linux_prep.explode("linux_banners", ignore_index=True)

    def bucketize(ver_str):
        """Map a banner to its major.minor version; float input (NaN, no banner) maps to 0.0."""
        if isinstance(ver_str, float):
            return parse("0.0")
        # The last space-separated token is taken as the version; keep its first two components.
        prepared = ver_str.split(" ")[-1].split(".")[0:2]
        return parse(".".join(prepared))

    df_linux_prep["linux_banners"] = df_linux_prep["linux_banners"].apply(bucketize)
    df_linux_banners = (
        df_linux_prep.groupby(["linux_banners", "manufacturer"], as_index=False)
        .nunique()
        .pivot(index="linux_banners", columns=["manufacturer"], values="md5")
        .fillna(value=0.0)
    )

    # savefig does not create missing directories on its own.
    FIGURE_DEST.mkdir(parents=True, exist_ok=True)

    with plt.rc_context({"font.family": "serif", "font.serif": ["Times"], "font.size": 22}):
        ax = df_linux_banners.plot(
            kind="barh",
            grid=True,
            stacked=True,
            logx=True,
            figsize=(21, 7),
            rot=0,
            legend=False,
            edgecolor="black",
            color=["grey"],
        )
        ax.set_ylabel(None)
        ax.set_xlabel("Detected Linux Kernel Version Banners [Grouped by Major.Minor, log]")
        # The first bucket is relabelled "unk."; version 0.0 (no banner) sorts first if present.
        ax.set_yticklabels(["unk."] + ax.get_yticklabels()[1:], ha="left", va="center", position=(-0.0275, 0))
        ax.set_axisbelow(True)
        # Annotate every bar with its total; iterating over the actual rows works for any
        # number of version buckets.
        for position, total in enumerate(df_linux_banners.sum(axis=1)):
            plt.text(total + 5, position, int(total), va="center")
        ax.xaxis.set_major_formatter(ScalarFormatter())
        plt.tight_layout()
        plt.savefig(FIGURE_DEST / "f9_corpus_linux_banners.pdf", bbox_inches="tight")
        plt.show()

In [None]:
# Render Figure 9 from the full corpus.
create_figure_9_detected_linux_kernel_banners_in_lfwc_samples(df)

## Figure 10 - Distribution of the nine detected ISAs in LFwC across all vendors

The three most prevalent ISA families are MIPS (5,993 samples), ARM (4,764), and x86 (2,095). There are 13,429 unique findings on ISAs across all samples, because included subsystems need not run the same ISA as the main system.

In [None]:
def create_figure_10_isa_distribution(df: pd.DataFrame) -> None:
    """Plot Figure 10: grouped, log-scaled bars of distinct samples per architecture and manufacturer.

    The "|"-separated `elf_architectures` entries are split into one row per architecture and
    distinct `md5` values are counted per (architecture, manufacturer). The chart is saved as
    `f10_corpus_architectures.pdf` inside `FIGURE_DEST` and shown.

    Font settings are applied through a matplotlib rc context, so the global rcParams are
    restored even if plotting raises.

    Args:
        df: Corpus table; the columns `elf_architectures`, `manufacturer` and `md5` are used.
    """
    df_arch_prep = df.copy()
    # Split only the rows that have architectures; the assignment aligns on the index, so all
    # other rows end up with NaN and drop out of the groupby below.
    arch_series = df_arch_prep[df_arch_prep["elf_architectures"].notnull()]["elf_architectures"].apply(
        lambda x: x.split("|")
    )
    df_arch_prep["elf_architectures"] = arch_series

    by_arch = df_arch_prep.explode("elf_architectures", ignore_index=True)
    by_arch = (
        by_arch.groupby(["elf_architectures", "manufacturer"], as_index=False)
        .nunique()
        .pivot(index="elf_architectures", columns=["manufacturer"], values="md5")
        .fillna(value=0.0)
    )

    # savefig does not create missing directories on its own.
    FIGURE_DEST.mkdir(parents=True, exist_ok=True)

    with plt.rc_context({"font.family": "serif", "font.serif": ["Times"], "font.size": 18}):
        ax = by_arch.plot(
            kind="bar",
            grid=True,
            stacked=False,
            logy=True,
            figsize=(20, 4),
            color=CMAP_2,
            edgecolor="black",
            legend=False,
            width=0.8,
            rot=0,
        )
        ax.set_axisbelow(True)
        ax.yaxis.set_major_formatter(ScalarFormatter())
        ax.set_ylabel("Sample Quantity [\\#, log]")
        ax.set_xlabel("Detected Architecture")
        ax.set_xlim(-0.5, 8.5)
        # NOTE(review): hard-coded display names; this assumes exactly nine architecture groups
        # whose sorted order matches this list -- confirm against the data.
        ax.set_xticklabels(["ARM", "ESP", "M68K", "MIPS", "PPC", "RISCV", "s/390", "SPARC", "x86"])
        # Vertical separators between neighbouring architecture groups.
        for i in range(0, 11):
            ax.axvline(i + 0.505, color="black", linewidth=1)
        plt.tight_layout()
        plt.savefig(FIGURE_DEST / "f10_corpus_architectures.pdf", bbox_inches="tight")
    plt.show()

In [None]:
# Render Figure 10 from the full corpus.
create_figure_10_isa_distribution(df)

### Interactive

Now it's your turn to play with the corpus! Are you comfortable with [pandas](https://pandas.pydata.org/docs/user_guide/index.html)? You can do some amazing stuff to query the data!

In [None]:
print("e.g., only show firmware samples where a MIPS architecture was found:")

df[
    df["elf_architectures"].  # take the "elf_architectures" column
    fillna("").               # replace all NULL values, where no architecture was found, with an empty string
    str.contains("mips")      # boolean mask: True for every row whose "elf_architectures" value contains the "mips" keyword
]