Skip to content

Commit

Permalink
Github Collector Enhancements (#1566)
Browse files Browse the repository at this point in the history
* Initial Commit

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>

* Included More features

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>

* Included Service Poll

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>

* Finishing Touches

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>

* Fixed Lint

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>

* Updated Based on Code Review

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>

* Fixed Artifact Download

* The GitHub API returns a zip file with the artifacts; I thought that it was actually JSON, so I am now fixing it

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>

* Fixed tests

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>

* Updated Based on Code Review

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>

---------

Signed-off-by: nathannaveen <42319948+nathannaveen@users.noreply.github.com>
  • Loading branch information
nathannaveen committed Jan 29, 2024
1 parent b0969e3 commit 52a55e4
Show file tree
Hide file tree
Showing 7 changed files with 450 additions and 39 deletions.
110 changes: 88 additions & 22 deletions cmd/guaccollect/cmd/github.go
Expand Up @@ -19,8 +19,11 @@ import (
"context"
"fmt"
"os"
"strings"
"time"

"github.com/guacsec/guac/pkg/cli"

"github.com/guacsec/guac/internal/client/githubclient"
csubclient "github.com/guacsec/guac/pkg/collectsub/client"
"github.com/guacsec/guac/pkg/collectsub/datasource"
Expand All @@ -33,6 +36,12 @@ import (
"github.com/spf13/viper"
)

// Names of the command-line flags specific to the github collector. The same
// strings are used as viper keys when the flag values are read back.
const (
// flag selecting the collector mode; the command help describes "release" and "workflow"
githubMode = "github-mode"
// flag for the SBOM name — presumably the artifact name to look for; confirm against the collector
githubSbom = "github-sbom"
// flag for the workflow file — presumably the workflow file name to look for; confirm against the collector
githubWorkflowFile = "github-workflow-file"
)

type githubOptions struct {
// datasource for the collector
dataSource datasource.CollectSource
Expand All @@ -42,12 +51,24 @@ type githubOptions struct {
blobAddr string
// run as poll collector
poll bool
// the mode to run the collector in
githubMode string
// the name of the sbom file to look for
sbomName string
// the name of the workflow file to look for
workflowFileName string
// the owner/repo name to use for the collector
ownerRepoName string
}

var githubCmd = &cobra.Command{
Use: "github [flags] release_url1 release_url2...",
Use: "github if <github-mode> is \"release\" then [flags] release_url1 release_url2..., otherwise if <github-mode> is \"workflow\" then [flags] <owner>/<repo>",
Short: "takes github repos and tags to download metadata documents stored in Github releases to add to GUAC graph utilizing Nats pubsub and blob store",
Long: `
Takes github repos and tags to download metadata documents stored in Github releases to add to GUAC graph.
<github-mode> must be either "workflow", "release", or "". If "", then the default is "release".
if <github-mode> is "workflow", then <owner-repo> must be specified.
guaccollect github checks repos and tags to download metadata documents stored in Github releases. Ingestion to GUAC happens via an event stream (NATS)
to allow for decoupling of the collectors from the ingestion into GUAC.
Expand All @@ -69,6 +90,9 @@ you have access to read and write to the respective blob store.`,
opts, err := validateGithubFlags(
viper.GetString("pubsub-addr"),
viper.GetString("blob-addr"),
viper.GetString(githubMode),
viper.GetString(githubSbom),
viper.GetString(githubWorkflowFile),
viper.GetString("csub-addr"),
viper.GetBool("csub-tls"),
viper.GetBool("csub-tls-skip-verify"),
Expand All @@ -95,10 +119,27 @@ you have access to read and write to the respective blob store.`,
collectorOpts := []github.Opt{
github.WithCollectDataSource(opts.dataSource),
github.WithClient(ghc),
github.WithMode(opts.githubMode),
github.WithSbomName(opts.sbomName),
github.WithWorkflowName(opts.workflowFileName),
}
if opts.poll {
collectorOpts = append(collectorOpts, github.WithPolling(30*time.Second))
}

if opts.ownerRepoName != "" {
if !strings.Contains(opts.ownerRepoName, "/") {
logger.Errorf("owner-repo flag must be in the format <owner>/<repo>")
} else {
ownerRepoName := strings.Split(opts.ownerRepoName, "/")
if len(ownerRepoName) != 2 {
logger.Errorf("owner-repo flag must be in the format <owner>/<repo>")
}
collectorOpts = append(collectorOpts, github.WithOwner(ownerRepoName[0]))
collectorOpts = append(collectorOpts, github.WithRepo(ownerRepoName[1]))
}
}

githubCollector, err := github.NewGithubCollector(collectorOpts...)
if err != nil {
logger.Errorf("unable to create Github collector: %v", err)
Expand All @@ -112,11 +153,14 @@ you have access to read and write to the respective blob store.`,
},
}

func validateGithubFlags(pubsubAddr string, blobAddr string, csubAddr string, csubTls bool, csubTlsSkipVerify bool, useCsub bool, poll bool, args []string) (githubOptions, error) {
func validateGithubFlags(pubsubAddr, blobAddr, csubAddr, githubMode, sbomName, workflowFileName string, csubTls, csubTlsSkipVerify, useCsub, poll bool, args []string) (githubOptions, error) {
var opts githubOptions
opts.pubsubAddr = pubsubAddr
opts.blobAddr = blobAddr
opts.poll = poll
opts.githubMode = githubMode
opts.sbomName = sbomName
opts.workflowFileName = workflowFileName

if useCsub {
csubOpts, err := csubclient.ValidateCsubClientFlags(csubAddr, csubTls, csubTlsSkipVerify)
Expand All @@ -131,35 +175,57 @@ func validateGithubFlags(pubsubAddr string, blobAddr string, csubAddr string, cs
return opts, err
}

// else direct CLI call
if len(args) < 1 {
return opts, fmt.Errorf("expected positional argument(s) for release_url(s)")
}
// Otherwise direct CLI call

sources := []datasource.Source{}
for _, arg := range args {
// TODO (mlieberman85): Below should be a github url parser helper instead of in the github collector
if _, _, err := github.ParseGithubReleaseDataSource(datasource.Source{
Value: arg,
}); err != nil {
return opts, fmt.Errorf("release_url parsing error. require format https://github.com/<org>/<repo>/releases/<optional_tag>: %v", err)
if githubMode == "release" {
if len(args) < 1 {
return opts, fmt.Errorf("expected positional argument(s) for release_url(s)")
}
sources = append(sources, datasource.Source{
Value: arg,

sources := []datasource.Source{}
for _, arg := range args {
// TODO (mlieberman85): Below should be a github url parser helper instead of in the github collector
if _, _, err := github.ParseGithubReleaseDataSource(datasource.Source{
Value: arg,
}); err != nil {
return opts, fmt.Errorf("release_url parsing error. require format https://github.com/<org>/<repo>/releases/<optional_tag>: %v", err)
}
sources = append(sources, datasource.Source{
Value: arg,
})
}

var err error
opts.dataSource, err = inmemsource.NewInmemDataSources(&datasource.DataSources{
GithubReleaseDataSources: sources,
})
}
if err != nil {
return opts, err
}
} else {
if len(args) != 1 {
return opts, fmt.Errorf("expected positional argument for owner-repo in the format <owner>/<repo>")
}
opts.ownerRepoName = args[0]

var err error
opts.dataSource, err = inmemsource.NewInmemDataSources(&datasource.DataSources{
GithubReleaseDataSources: sources,
})
if err != nil {
return opts, err
if opts.ownerRepoName == "" {
return opts, fmt.Errorf("owner-repo argument must be in the format <owner>/<repo>")
}
}

return opts, nil
}

func init() {
set, err := cli.BuildFlags([]string{githubMode, githubSbom, githubWorkflowFile})
if err != nil {
fmt.Fprintf(os.Stderr, "failed to setup flag: %v", err)
os.Exit(1)
}
githubCmd.PersistentFlags().AddFlagSet(set)
if err := viper.BindPFlags(githubCmd.PersistentFlags()); err != nil {
fmt.Fprintf(os.Stderr, "failed to bind flags: %v", err)
os.Exit(1)
}
rootCmd.AddCommand(githubCmd)
}
148 changes: 145 additions & 3 deletions internal/client/githubclient/githubclient.go
Expand Up @@ -16,15 +16,16 @@
package githubclient

import (
"archive/zip"
"bytes"
"context"
"fmt"
"io"
"net/http"

"github.com/google/go-github/v50/github"
"github.com/guacsec/guac/internal/client"
"github.com/guacsec/guac/pkg/version"
"golang.org/x/oauth2"
"io"
"net/http"
)

// TODO (mlieberman85): This interface will probably be pulled out into an interface that can support other
Expand All @@ -46,6 +47,15 @@ type GithubClient interface {

// GetReleaseAsset fetches the content of a release asset, e.g. artifacts, metadata documents, etc.
GetReleaseAsset(asset client.ReleaseAsset) (*client.ReleaseAssetContent, error)

// GetWorkflow fetches the workflow for a given workflow name or all workflows if the workflow name is empty
GetWorkflow(ctx context.Context, owner string, repo string, githubWorkflowName string) ([]*client.Workflow, error)

// GetLatestWorkflowRun fetches the latest workflow run for a given workflow id
GetLatestWorkflowRun(ctx context.Context, owner, repo string, workflowId int64) (*client.WorkflowRun, error)

// GetWorkflowRunArtifacts fetches all the workflow run artifacts for a given workflow run id
GetWorkflowRunArtifacts(ctx context.Context, owner, repo, githubSBOMName string, runID int64) ([]*client.WorkflowArtifactContent, error)
}

type githubClient struct {
Expand Down Expand Up @@ -111,6 +121,138 @@ func (gc *githubClient) GetLatestRelease(ctx context.Context, owner string, repo
return &release, nil
}

// GetWorkflow retrieves workflows from the given GitHub repository.
//
// When githubWorkflowFileName is non-empty, only the workflow defined by that
// file is looked up and returned as a single-element slice; an error is
// returned if that lookup fails.
// When githubWorkflowFileName is empty, the workflows returned by a single
// ListWorkflows call are returned.
// NOTE(review): no list options are passed, so this presumably only covers the
// first page of results — confirm whether pagination is needed.
func (gc *githubClient) GetWorkflow(ctx context.Context, owner, repo, githubWorkflowFileName string) ([]*client.Workflow, error) {
	if githubWorkflowFileName != "" {
		workflow, _, err := gc.ghClient.Actions.GetWorkflowByFileName(ctx, owner, repo, githubWorkflowFileName)
		if err != nil {
			return nil, fmt.Errorf("unable to get workflow by file name: %w", err)
		}

		// Use the nil-safe accessors: Name and ID are pointer fields and a
		// direct dereference would panic if the API omitted either of them.
		return []*client.Workflow{
			{
				Name: workflow.GetName(),
				Id:   workflow.GetID(),
			},
		}, nil
	}

	workflows, _, err := gc.ghClient.Actions.ListWorkflows(ctx, owner, repo, nil)
	if err != nil {
		return nil, fmt.Errorf("unable to list workflows: %w", err)
	}

	var res []*client.Workflow

	for _, workflow := range workflows.Workflows {
		res = append(res, &client.Workflow{
			Name: workflow.GetName(),
			Id:   workflow.GetID(),
		})
	}

	return res, nil
}

// GetLatestWorkflowRun retrieves the most recent run of the workflow identified
// by workflowId in the given GitHub repository.
//
// It returns an error if the workflow runs cannot be listed, and (nil, nil)
// when the workflow has no runs.
func (gc *githubClient) GetLatestWorkflowRun(ctx context.Context, owner, repo string, workflowId int64) (*client.WorkflowRun, error) {
	runs, _, err := gc.ghClient.Actions.ListWorkflowRunsByID(ctx, owner, repo, workflowId, nil)
	if err != nil {
		return nil, fmt.Errorf("unable to list workflow runs: %w", err)
	}

	if runs == nil || len(runs.WorkflowRuns) == 0 {
		return nil, nil
	}

	// runs.WorkflowRuns is sorted by created_at in descending order so the first element is the latest run
	latest := runs.WorkflowRuns[0]

	// Use the nil-safe accessors: WorkflowID and ID are pointer fields and a
	// direct dereference would panic if the API omitted either of them.
	return &client.WorkflowRun{WorkflowId: latest.GetWorkflowID(), RunId: latest.GetID()}, nil
}

// GetWorkflowRunArtifacts lists the artifacts of the workflow run runID,
// downloads each selected artifact archive and returns the files extracted
// from those archives.
//
// If githubSBOMName is empty every listed artifact is downloaded; otherwise
// only artifacts whose name equals githubSBOMName are. Any listing, download
// or extraction failure aborts the whole call with an error.
func (gc *githubClient) GetWorkflowRunArtifacts(ctx context.Context, owner, repo, githubSBOMName string, runID int64) ([]*client.WorkflowArtifactContent, error) {
	var res []*client.WorkflowArtifactContent

	// get workflow run artifacts
	artifacts, _, err := gc.ghClient.Actions.ListWorkflowRunArtifacts(ctx, owner, repo, runID, nil)
	if err != nil {
		return nil, fmt.Errorf("unable to list workflow run artifacts: %w", err)
	}

	for _, artifact := range artifacts.Artifacts {
		// If the githubSBOMName is empty, we want to return all artifacts
		// Otherwise, we only want to return the artifacts that have the name of githubSBOMName.
		// GetName is nil-safe, so an artifact without a name is skipped instead of causing a panic.
		if githubSBOMName != "" && artifact.GetName() != githubSBOMName {
			continue
		}

		// Resolve the download location of the artifact archive.
		downloadURL, _, err := gc.ghClient.Actions.DownloadArtifact(ctx, owner, repo, artifact.GetID(), true)
		if err != nil {
			return nil, fmt.Errorf("unable to download artifact: %w", err)
		}
		if downloadURL == nil {
			// Guard against dereferencing a missing URL below.
			return nil, fmt.Errorf("unable to download artifact: no download URL returned for artifact %d", artifact.GetID())
		}

		extracted, err := DownloadAndExtractZip(downloadURL.String(), runID)
		if err != nil {
			return nil, fmt.Errorf("unable to download and extract zip: %w", err)
		}

		res = append(res, extracted...)
	}

	return res, nil
}

// DownloadAndExtractZip downloads the zip archive at url and returns the
// contents of every file entry it contains, each tagged with runID.
//
// Directory entries are skipped because they carry no data. No filtering by
// content type is performed; callers receive every file in the archive.
// An error is returned if the download fails, the server does not answer with
// 200 OK, the payload is not a valid zip archive, or any entry cannot be read.
//
// NOTE(review): the request uses http.Get and therefore has no timeout;
// consider a client with a timeout or a context-aware request.
func DownloadAndExtractZip(url string, runID int64) ([]*client.WorkflowArtifactContent, error) {
	// Download the zip file
	resp, err := http.Get(url)
	if err != nil {
		return nil, fmt.Errorf("error getting zip file: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("bad status: %s", resp.Status)
	}

	// zip.NewReader needs random access, so the whole archive is buffered in memory.
	archive, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("error reading response body: %w", err)
	}

	// Read the zip archive
	zipReader, err := zip.NewReader(bytes.NewReader(archive), int64(len(archive)))
	if err != nil {
		return nil, fmt.Errorf("error reading zip file: %w", err)
	}

	var files []*client.WorkflowArtifactContent

	// Iterate through each file in the archive
	for _, zipFile := range zipReader.File {
		if zipFile.FileInfo().IsDir() {
			continue
		}

		fileData, err := readZipEntry(zipFile)
		if err != nil {
			return nil, err
		}

		files = append(files, &client.WorkflowArtifactContent{Name: zipFile.Name, Bytes: fileData, RunId: runID})
	}

	return files, nil
}

// readZipEntry returns the full decompressed content of a single zip entry.
// The entry is closed before returning, so handles do not pile up while the
// caller iterates over a large archive.
func readZipEntry(zipFile *zip.File) ([]byte, error) {
	f, err := zipFile.Open()
	if err != nil {
		return nil, fmt.Errorf("error opening file in zip: %w", err)
	}
	defer f.Close()

	// Read to EOF rather than into a buffer sized from the archive header:
	// the header size is untrusted, and reaching EOF is what lets the zip
	// reader verify the entry's size and checksum.
	data, err := io.ReadAll(f)
	if err != nil {
		return nil, fmt.Errorf("error reading file data: %w", err)
	}

	return data, nil
}

func (gc *githubClient) GetCommitSHA1(ctx context.Context, owner string, repo string, ref string) (string, error) {
commit, _, err := gc.ghClient.Repositories.GetCommitSHA1(ctx, owner, repo, ref, "")

Expand Down

0 comments on commit 52a55e4

Please sign in to comment.