2 changes: 1 addition & 1 deletion exporters/gcp/gcp.go
@@ -20,7 +20,7 @@ import (

const (
// Name contains name of the exporter.
Name = "gcp"
Name = "GCP"
)

// Exporter is an instance of GCP Exporter.
2 changes: 1 addition & 1 deletion exporters/postgres/postgres.go
@@ -34,7 +34,7 @@ import (

const (
// Name contains name of the exporter.
Name = "postgre"
Name = "postgres"
)

// Exporter is an instance of Postgres Exporter.
26 changes: 13 additions & 13 deletions hashr.go
@@ -40,18 +40,18 @@ import (
)

var (
processingWorkerCount = flag.Int("processing_worker_count", 2, "Number of processing workers.")
importersToRun = flag.String("importers", strings.Join([]string{}, ","), fmt.Sprintf("Importers to be run: %s,%s,%s,%s", gcp.RepoName, targz.RepoName, windows.RepoName, wsus.RepoName))
exportersToRun = flag.String("exporters", strings.Join([]string{}, ","), fmt.Sprintf("Exporters to be run: %s,%s", gcpExporter.Name, postgresExporter.Name))
jobStorage = flag.String("storage", "", "Storage that should be used for storing data about processing jobs, can have one of the two values: postgres, cloudspanner")
cacheDir = flag.String("cache_dir", "/tmp/", "Path to cache dir used to store local cache.")
export = flag.Bool("export", true, "Whether to export samples, otherwise, they'll be saved to disk")
exportPath = flag.String("export_path", "/tmp/hashr-uploads", "If export is set to false, this is the folder where samples will be saved.")
reprocess = flag.String("reprocess", "", "Sha256 of sources that should be reprocessed")
spannerDBPath = flag.String("spanner_db_path", "", "Path to spanner DB.")
uploadPayloads = flag.Bool("upload_payloads", false, "If true the content of the files will be uploaded using defined exporters.")
cloudSpannerWorkerCount = flag.Int("cloudspanner_worker_count", 100, "Number of workers/goroutines that will be used to upload data to Cloud Spanner.")
gcpExporterGCSbucket = flag.String("gcp_exporter_gcs_bucket", "", "Name of the GCS bucket which will be used by GCP exporter to store exported samples.")
processingWorkerCount = flag.Int("processing_worker_count", 2, "Number of processing workers.")
importersToRun = flag.String("importers", strings.Join([]string{}, ","), fmt.Sprintf("Importers to be run: %s,%s,%s,%s", gcp.RepoName, targz.RepoName, windows.RepoName, wsus.RepoName))
exportersToRun = flag.String("exporters", strings.Join([]string{}, ","), fmt.Sprintf("Exporters to be run: %s,%s", gcpExporter.Name, postgresExporter.Name))
jobStorage = flag.String("storage", "", "Storage that should be used for storing data about processing jobs, can have one of the two values: postgres, cloudspanner")
cacheDir = flag.String("cache_dir", "/tmp/", "Path to cache dir used to store local cache.")
export = flag.Bool("export", true, "Whether to export samples, otherwise, they'll be saved to disk")
exportPath = flag.String("export_path", "/tmp/hashr-uploads", "If export is set to false, this is the folder where samples will be saved.")
reprocess = flag.String("reprocess", "", "Sha256 of sources that should be reprocessed")
spannerDBPath = flag.String("spanner_db_path", "", "Path to spanner DB.")
uploadPayloads = flag.Bool("upload_payloads", false, "If true the content of the files will be uploaded using defined exporters.")
gcpExporterWorkerCount = flag.Int("gcp_exporter_worker_count", 100, "Number of workers/goroutines that will be used to upload data to Cloud Spanner.")
gcpExporterGCSbucket = flag.String("gcp_exporter_gcs_bucket", "", "Name of the GCS bucket which will be used by GCP exporter to store exported samples.")

// Postgres DB flags
postgresHost = flag.String("postgres_host", "localhost", "PostgreSQL instance address.")
@@ -156,7 +156,7 @@ func main() {
glog.Exitf("Could not initialize GCP Storage client: %v", err)
}

gceExporter, err := gcpExporter.NewExporter(spannerClient, storageClient, *gcpExporterGCSbucket, *uploadPayloads, *cloudSpannerWorkerCount)
gceExporter, err := gcpExporter.NewExporter(spannerClient, storageClient, *gcpExporterGCSbucket, *uploadPayloads, *gcpExporterWorkerCount)
if err != nil {
glog.Exitf("Error initializing Postgres exporter: %v", err)
}
32 changes: 23 additions & 9 deletions readme.md
@@ -21,7 +21,7 @@
- [WSUS](#wsus)
- [Setting up exporters](#setting-up-exporters)
- [Setting up Postgres exporter](#setting-up-postgres-exporter)
- [Setting up Cloud Spanner exporter](#setting-up-cloud-spanner-exporter)
- [Setting up GCP exporter](#setting-up-gcp-exporter)
- [Additional flags](#additional-flags)

## About
@@ -369,27 +369,41 @@ If you didn't choose Postgres for processing job storage follow steps 1 & 2 from

This is currently the default exporter, so you don't need to enable it explicitly. By default, the content of the actual files won't be uploaded to the PostgreSQL DB; if you wish to change that, use the `-upload_payloads true` flag.

In order for the Postgres exporter to work you need to set the following flags: `-postgresHost <host> -postgresPort <port> -postgresUser <user> -postgresPassword <pass> -postgresDBName <db_name>`
In order for the Postgres exporter to work you need to set the following flags: `-exporters postgres -postgresHost <host> -postgresPort <port> -postgresUser <user> -postgresPassword <pass> -postgresDBName <db_name>`
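
As a rough illustration (not a verbatim recipe), a run with the Postgres exporter enabled could look like the sketch below; `<importer_name>` and `<postgres_connection_flags_from_above>` are placeholders for your own importer choice and the connection flags listed above:

``` shell
# Minimal sketch: enable the Postgres exporter explicitly and, optionally,
# upload file payloads as well. <importer_name> and the Postgres connection
# flags are placeholders for your own configuration.
./hashr -importers <importer_name> -storage postgres \
  -exporters postgres -upload_payloads true \
  <postgres_connection_flags_from_above>
```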

#### Setting up Cloud Spanner exporter
#### Setting up GCP exporter

Cloud Spanner exporter allows sending of hashes, file metadata and the actual content of the file to a GCP Spanner instance. If you haven't set up Cloud Spanner for storing processing jobs, follow the steps in [Setting up Cloud Spanner](####setting-up-cloud-spanner) and instead of the last step run the following command to create necessary tables:
The GCP exporter allows sending hashes and file metadata to a GCP Spanner instance. Optionally, you can upload the extracted files to a GCS bucket. If you haven't set up Cloud Spanner for storing processing jobs, follow the steps in [Setting up Cloud Spanner](####setting-up-cloud-spanner) and, instead of the last step, run the following command to create the necessary tables:

``` shell
gcloud spanner databases ddl update hashr --instance=hashr --ddl-file=scripts/CreateCloudSpannerExporterTables.ddl
```

If you have already set up Cloud Spanner for storing jobs data, you just need to run the command above and you're ready to go.

If you'd like to upload the extracted files to GCS, you need to create a GCS bucket:

Step 1: Create the GCS bucket:
``` shell
gsutil mb -p <project_name> gs://<gcs_bucket_name>
```

Step 2: Make the service account admin of this bucket:
``` shell
gsutil iam ch serviceAccount:hashr@<project_name>.iam.gserviceaccount.com:objectAdmin gs://<gcs_bucket_name>
```

To use this exporter you need to provide the following flags: `-exporters GCP -gcp_exporter_gcs_bucket <gcs_bucket_name>`
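
Putting the pieces together, a full run with the GCP exporter might look like the following sketch. It assumes the `hashr` Spanner instance and database created earlier and uses the standard Spanner resource path format; `<importer_name>`, `<project_name>` and `<gcs_bucket_name>` are placeholders:

``` shell
# Sketch only: export hashes and metadata to Cloud Spanner and, because
# -upload_payloads is set, copy the extracted files to the GCS bucket too.
./hashr -importers <importer_name> -storage cloudspanner \
  -spanner_db_path projects/<project_name>/instances/hashr/databases/hashr \
  -exporters GCP -gcp_exporter_gcs_bucket <gcs_bucket_name> \
  -upload_payloads true
```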

### Additional flags

1. `-processingWorkerCount`: This flag controls number of parallel processing workers. Processing is CPU and I/O heavy, during my testing I found that having 2 workers is the most optimal solution.
1. `-cacheDir`: Location of local cache used for deduplication, it's advised to change that from `/tmp` to e.g. home directory of the user that will be running hashr.
1. `-processing_worker_count`: This flag controls the number of parallel processing workers. Processing is CPU and I/O heavy; during my testing I found that 2 workers is the optimal setting.
1. `-cache_dir`: Location of the local cache used for deduplication; it's advised to change this from `/tmp` to e.g. the home directory of the user that will be running hashr.
1. `-export`: When set to false, hashr will save the results to disk, bypassing the exporters.
1. `-exportPath`: If export is set to false, this is the folder where samples will be saved.
1. `-export_path`: If export is set to false, this is the folder where samples will be saved.
1. `-reprocess`: Allows reprocessing of a given source (e.g. in case it errored out) based on the sha256 value stored in the jobs table.
1. `-uploadPayloads`: Controls if the actual content of the file will be uploaded by defined exporters.
2. `-cloudSpannerWorkerCount`: Number of workers/goroutines that will be used to upload data to Cloud Spanner.
1. `-upload_payloads`: Controls whether the actual content of the files will be uploaded by the defined exporters.
1. `-gcp_exporter_worker_count`: Number of workers/goroutines that the GCP exporter will use to upload the data.
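
As a hedged example of combining the flags above, the sketch below runs hashr with explicit tuning values and writes results to disk instead of exporting them; the cache directory is an arbitrary example value and `<importer_name>` is a placeholder. Note that Go's flag parsing requires the `-export=false` form (rather than `-export false`) for boolean flags:

``` shell
# Sketch: 2 processing workers, a user-owned cache directory, and samples
# saved to disk under -export_path instead of going through an exporter.
./hashr -importers <importer_name> -storage postgres \
  -processing_worker_count 2 -cache_dir /home/hashr/cache \
  -export=false -export_path /tmp/hashr-uploads
```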


This is not an officially supported Google product.