converged-computing · vsoch · May 31, 2023 · May 30, 2023 · May 30, 2023 · May 30, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,18 @@
+# CHANGELOG
+
+This is a manually generated log to track changes to the repository for each release.
+Each section should include general headers such as **Implemented enhancements**
+and **Merged pull requests**. Critical items to know are:
+
+ - renamed commands
+ - deprecated / removed commands
+ - changed defaults
+ - backward incompatible changes
+ - migration guidance
+ - changed behaviour
+
+The versions coincide with releases on pip. Only major versions will be released as tags on GitHub.
+
+## [0.0.x](https://github.com/converged-computing/flux-metrics-api/tree/main) (0.0.x)
+ - Support for certificates for uvicorn and change default port to 8443 (0.0.1)
+ - Skeleton release (0.0.0)
diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,7 @@
 FROM fluxrm/flux-sched:focal
 
 # docker build -t flux_metrics_api .
-# docker run -it -p 8080:8080 flux_metrics_api
+# docker run -it -p 8443:8443 flux_metrics_api
 
 LABEL maintainer="Vanessasaurus <@vsoch>"
 

diff --git a/README.md b/README.md
@@ -46,7 +46,7 @@ You'll want to be running in a Flux instance, as we need to connect to the broke
 $ flux start --test-size=4
 ```
 
-And then start the server. This will use a default port and host (0.0.0.0:8080) that you can customize
+And then start the server. This will use a default port and host (0.0.0.0:8443) that you can customize
 if desired.
 
 ```bash
@@ -56,6 +56,12 @@ $ flux-metrics-api start
 $ flux-metrics-api start --port 9000 --host 127.0.0.1
 ```
 
+If you want to serve over SSL (HTTPS, still on the default port 8443 unless you set `--port`), you can provide the path to a certificate and keyfile:
+
+```bash
+$ flux-metrics-api start --ssl-certfile /etc/certs/tls.crt --ssl-keyfile /etc/certs/tls.key
+```
+
 See `--help` to see other options available.
 
 ### Endpoints
@@ -67,7 +73,7 @@ See `--help` to see other options available.
 Here is an example to get the "node_up_count" metric:
 
 ```bash
- curl -s http://localhost:8080/apis/custom.metrics.k8s.io/v1beta2/namespaces/flux-operator/metrics/node_up_count | jq
+ curl -s http://localhost:8443/apis/custom.metrics.k8s.io/v1beta2/namespaces/flux-operator/metrics/node_up_count | jq
 ```
 ```console
 {
@@ -101,15 +107,20 @@ be a demo. You can either build it yourself, or use our build.
 
 ```bash
 $ docker build -t flux_metrics_api .
-$ docker run -it -p 8080:8080 flux_metrics_api
+$ docker run -it -p 8443:8443 flux_metrics_api
 ```
 or
 
 ```bash
-$ docker run -it -p 8080:8080 ghcr.io/converged-computing/flux-metrics-api
+$ docker run -it -p 8443:8443 ghcr.io/converged-computing/flux-metrics-api
 ```
 
-You can then open up the browser at [http://localhost:8080/metrics/](http://localhost:8080/metrics) to see
+### Development
+
+Note that this is implemented in Python, but (I found this after) we could [also use Go](https://github.com/kubernetes-sigs/custom-metrics-apiserver).
+Specifically, I found this repository useful to see the [spec format](https://github.com/kubernetes-sigs/custom-metrics-apiserver/blob/master/pkg/generated/openapi/custommetrics/zz_generated.openapi.go).
+
+You can then open up the browser at [http://localhost:8443/metrics/](http://localhost:8443/metrics) to see
 the metrics!
 
 ## 😁️ Contributors 😁️

diff --git a/flux_metrics_api/apis.py b/flux_metrics_api/apis.py
@@ -0,0 +1,54 @@
+# Copyright 2023 Lawrence Livermore National Security, LLC and other
+# HPCIC DevTools Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (MIT)
+
+import json
+import os
+import subprocess
+
+import flux_metrics_api.defaults as defaults
+import flux_metrics_api.utils as utils
+
+# Global cache of responses
+cache = {}
+
+
+def get_kubernetes_endpoint(endpoint):
+    """
+    Get an endpoint from the cluster as parsed JSON ({} when unavailable).
+    """
+    if defaults.USE_CACHE and endpoint in cache:
+        return cache[endpoint]
+
+    # Point to the internal API server hostname
+    api_server = "https://kubernetes.default.svc"
+
+    # Path to ServiceAccount directory
+    sa_account_dir = "/var/run/secrets/kubernetes.io/serviceaccount"
+    namespace_file = os.path.join(sa_account_dir, "namespace")
+    cert_file = os.path.join(sa_account_dir, "ca.crt")
+    token_file = os.path.join(sa_account_dir, "token")
+
+    # Cut out early if we aren't running in the pod
+    if not all(
+        map(os.path.exists, [sa_account_dir, namespace_file, token_file, cert_file])
+    ):
+        return {}
+
+    # Get the token to do the request
+    token = utils.read_file(token_file)
+
+    # Using subprocess to not add extra dependency - yes requires curl.
+    # An argument list (no shell) keeps the token and endpoint from being
+    # parsed by a shell, e.g. "&" or "?" in a query string.
+    header = f"Authorization: Bearer {token}"
+    url = f"{api_server}/{endpoint}"
+    cmd = ["curl", "--cacert", cert_file, "--header", header, "-X", "GET", url]
+    try:
+        # A failed curl call or a non-JSON body both mean "no data"
+        output = json.loads(subprocess.check_output(cmd))
+    except (OSError, subprocess.CalledProcessError, ValueError):
+        return {}
+    cache[endpoint] = output
+    return output
diff --git a/flux_metrics_api/defaults.py b/flux_metrics_api/defaults.py
@@ -3,6 +3,16 @@
 #
 # SPDX-License-Identifier: (MIT)
 
-API_VERSION = "custom.metrics.k8s.io/v1beta2"
+API_ENDPOINT = "custom.metrics.k8s.io/v1beta2"
 API_ROOT = "/apis/custom.metrics.k8s.io/v1beta2"
-NAMESPACES = None
+NAMESPACE = "flux-operator"
+SERVICE_NAME = "custom-metrics-apiserver"
+USE_CACHE = True
+
+
+def API_VERSION():
+    """
+    Return the last path segment of API_ENDPOINT, ignoring trailing slashes.
+    """
+    # Reading a module-level name needs no "global" declaration
+    return API_ENDPOINT.rstrip("/").rsplit("/", 1)[-1]
diff --git a/flux_metrics_api/routes.py b/flux_metrics_api/routes.py
@@ -3,33 +3,42 @@
 #
 # SPDX-License-Identifier: (MIT)
 
+from apispec import APISpec
 from starlette.endpoints import HTTPEndpoint
 from starlette.responses import JSONResponse
 from starlette.routing import Route
-from starlette.schemas import SchemaGenerator
+from starlette_apispec import APISpecSchemaGenerator
 
 import flux_metrics_api.defaults as defaults
 import flux_metrics_api.types as types
 import flux_metrics_api.version as version
 from flux_metrics_api.metrics import metrics
 
-schemas = SchemaGenerator(
-    {
-        "openapi": "3.0.0",
-        "info": {"title": "Flux Metrics API", "version": version.__version__},
-    }
+schemas = APISpecSchemaGenerator(
+    APISpec(
+        title="Flux Metrics API",
+        version=version.__version__,
+        openapi_version="3.0.0",
+        info={"description": "Export Flux custom metrics."},
+    )
+)
+
+not_found_response = JSONResponse(
+    {"detail": "The metric server is not running in a Kubernetes pod."},
+    status_code=404,
 )
 
 
 class Root(HTTPEndpoint):
     """
     Root of the API
 
-    This needs to return 200 for a health check
+    This needs to return 200 for a health check, and the response body also
+    needs to be the listing of available metrics (types.new_resource_list).
     """
 
     async def get(self, request):
-        return JSONResponse({})
+        return JSONResponse(types.new_resource_list())
 
 
 def get_metric(request):
@@ -41,18 +50,13 @@ def get_metric(request):
     """
     metric_name = request.path_params["metric_name"]
     namespace = request.path_params.get("namespace")
+    print(f"Requested metric {metric_name} in  namespace {namespace}")
 
-    if (
-        namespace is not None
-        and defaults.NAMESPACES is not None
-        and namespace not in defaults.NAMESPACES
-    ):
-        return JSONResponse(
-            {"detail": "This namespace is not known to the server."}, status_code=404
-        )
-
+    # TODO we don't do anything with namespace currently, we assume we won't
+    # be able to hit this if running in the wrong one
     # Unknown metric
     if metric_name not in metrics:
+        print(f"Unknown metric requested {metric_name}")
         return JSONResponse(
             {"detail": "This metric is not known to the server."}, status_code=404
         )
@@ -63,7 +67,10 @@ def get_metric(request):
     # Get the value from Flux, assemble into listing
     value = metrics[metric_name]()
     metric_value = types.new_metric(metric, value=value)
-    listing = types.new_metric_list([metric_value])
+
+    # Give the endpoint for the service as metadata
+    metadata = {"selfLink": defaults.API_ROOT}
+    listing = types.new_metric_list([metric_value], metadata=metadata)
     return JSONResponse(listing)
 
 
@@ -79,24 +86,50 @@ async def get(self, request):
         return get_metric(request)
 
 
+class APIGroupList(HTTPEndpoint):
+    """
+    Serve a faux group list (or a 404 if empty) for our custom metrics endpoint.
+    """
+
+    async def get(self, request):
+        groups = types.new_group_list()
+        if groups:
+            return JSONResponse(groups)
+        return not_found_response
+
+
+class OpenAPI(HTTPEndpoint):
+    """
+    Forward the cluster openapi endpoint for the version in the request path.
+    """
+
+    async def get(self, request):
+        # Read the path parameter inline; an empty schema becomes a 404
+        spec = types.get_cluster_schema(request.path_params["version"])
+        if spec:
+            return JSONResponse(spec)
+        return not_found_response
+
+
 def openapi_schema(request):
     """
     Get the openapi spec from the endpoints
-
-    TODO: debug why paths empty
     """
     return JSONResponse(schemas.get_schema(routes=routes))
 
 
 # STOPPED HERE - make open api spec s we can see endpoints and query
 routes = [
     Route(defaults.API_ROOT, Root),
-    # Optional for openapi, we could add if needed
+    # This is a faux route so we can get the preferred resource version
+    Route("/apis", APIGroupList),
+    Route("/openapi/{version}", OpenAPI),
     Route(defaults.API_ROOT + "/namespaces/{namespace}/metrics/{metric_name}", Metric),
     Route(defaults.API_ROOT + "/{resource}/{name}/{metric_name}", Metric),
     Route(
         defaults.API_ROOT + "/namespaces/{namespace}/{resource}/{name}/{metric_name}",
         Metric,
     ),
+    # Route("/openapi/v2", openapi_schema, include_in_schema=False),
     Route(f"{defaults.API_ROOT}/openapi/v2", openapi_schema, include_in_schema=False),
 ]
diff --git a/flux_metrics_api/server.py b/flux_metrics_api/server.py
@@ -65,20 +65,20 @@ def get_parser():
     )
     start.add_argument(
         "--port",
-        help="Port to run application",
-        default=8080,
+        help="Port to run application (defaults to 8443)",
+        default=8443,
         type=int,
     )
+    start.add_argument("--namespace", help="Namespace the API is running in")
     start.add_argument(
-        "--namespace", help="Scope to running in these namespace(s)", action="append"
+        "--service-name", help="Service name the metrics service is running from"
     )
     start.add_argument(
         "--api-path",
         dest="api_path",
         help="Custom API path (defaults to /apis/custom.metrics.k8s.io/v1beta2)",
         default=None,
     )
-
     start.add_argument(
         "--host",
         help="Host address to run application",
@@ -90,15 +90,35 @@ def get_parser():
         default=False,
         action="store_true",
     )
+    start.add_argument(
+        "--no-cache",
+        help="Do not cache Kubernetes API responses.",
+        default=False,
+        action="store_true",
+    )
+    start.add_argument("--ssl-keyfile", help="full path to ssl keyfile")
+    start.add_argument("--ssl-certfile", help="full path to ssl certfile")
     return parser
 
 
 def start(args):
     """
     Start the server with uvicorn
     """
+    # Validate certificates if provided: keyfile and certfile must be given together
+    if args.ssl_keyfile and not args.ssl_certfile:
+        sys.exit("A --ssl-keyfile was provided without a --ssl-certfile.")
+    if args.ssl_certfile and not args.ssl_keyfile:
+        sys.exit("A --ssl-certfile was provided without a --ssl-keyfile.")
+
     app = Starlette(debug=args.debug, routes=routes)
-    uvicorn.run(app, host=args.host, port=args.port)
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        ssl_keyfile=args.ssl_keyfile,
+        ssl_certfile=args.ssl_certfile,
+    )
 
 
 def main():
@@ -131,14 +151,22 @@ def help(return_code=0):
     )
 
     # Setup the registry - non verbose is default
-    print(f"API endpoint is at {defaults.API_ROOT}")
     if args.api_path is not None:
-        print(f"Setting API endpoint to {args.api_path}")
         defaults.API_ROOT = args.api_path
+    print(f"API endpoint is at {defaults.API_ROOT}")
 
-    # Limit to specific namespaces?
+    # Do not cache responses
+    if args.no_cache is True:
+        defaults.USE_CACHE = False
+
+    # Set namespace or service name to be different than defaults
     if args.namespace:
-        defaults.NAMESPACES = args.namespace
+        defaults.NAMESPACE = args.namespace
+    print(f"Running from namespace {defaults.NAMESPACE}")
+
+    if args.service_name:
+        defaults.SERVICE_NAME = args.service_name
+    print(f"Service name {defaults.SERVICE_NAME}")
 
     # Does the user want a shell?
     if args.command == "start":