From 7e58d22c4dc7059780990ce018e0124ed0c2b41b Mon Sep 17 00:00:00 2001
From: Maksim Shudrak
Date: Sat, 16 Sep 2023 02:25:59 +0000
Subject: [PATCH 01/12] bug: Fixes parallelization and improves output logic

1. Fixes #267 when gcp_scanner would hang indefinitely
2. Output file does not contain the projects parent node anymore
3. Minor code formatting using pyformat

Fixes #271. GCP Scanner no longer relies on the multiprocessing and
concurrent.futures frameworks. We use threads and implement our own logic to
control the worker count. New options were added to control the number of
threads.
---
 src/gcp_scanner/arguments.py |  14 +-
 src/gcp_scanner/models.py    |   4 +-
 src/gcp_scanner/scanner.py   | 425 +++++++++++++++++++++--------------
 3 files changed, 271 insertions(+), 172 deletions(-)

diff --git a/src/gcp_scanner/arguments.py b/src/gcp_scanner/arguments.py
index 4147aaae..bfce9d07 100644
--- a/src/gcp_scanner/arguments.py
+++ b/src/gcp_scanner/arguments.py
@@ -129,11 +129,17 @@ def arg_parser():
       help='Save logs to the path specified rather than displaying in\
         console')
   parser.add_argument(
-      '-wc',
-      '--worker-count',
+      '-pwc',
+      '--project-worker-count',
       default=1,
-      dest='worker_count',
-      help='Set limit for workers run in parallel.')
+      dest='project_worker_count',
+      help='Set limit for project crawlers run in parallel.')
+  parser.add_argument(
+      '-rwc',
+      '--resource-worker-count',
+      default=1,
+      dest='resource_worker_count',
+      help='Set limit for resource crawlers run in parallel.')
 
   args: argparse.Namespace = parser.parse_args()
 
diff --git a/src/gcp_scanner/models.py b/src/gcp_scanner/models.py
index 1ce9ff35..8ae34ecc 100644
--- a/src/gcp_scanner/models.py
+++ b/src/gcp_scanner/models.py
@@ -58,7 +58,7 @@ def __init__(
       sa_name,
       credentials,
       chain_so_far,
-      worker_count
+      resource_worker_count
   ):
     self.project = project
     self.sa_results = sa_results
@@ -70,4 +70,4 @@ def __init__(
     self.sa_name = sa_name
     self.credentials = credentials
     self.chain_so_far = chain_so_far
-    self.worker_count = worker_count
+    self.resource_worker_count = resource_worker_count
diff --git a/src/gcp_scanner/scanner.py b/src/gcp_scanner/scanner.py
index 982919ad..5c3c1a2a 100644
--- a/src/gcp_scanner/scanner.py
+++ b/src/gcp_scanner/scanner.py
@@ -13,20 +13,19 @@
 # limitations under the License.
 
-"""The main module that initiates scanning of GCP resources.
-
-"""
+"""The main module that initiates scanning of GCP resources."""
 
 import collections
 import concurrent
+from datetime import datetime
 import json
+from json.decoder import JSONDecodeError
 import logging
-import multiprocessing
 import os
-import sys
-from datetime import datetime
-from json.decoder import JSONDecodeError
 from pathlib import Path
-from typing import List, Dict, Optional, Union, Any
+import sys
+import threading
+import time
+from typing import Any, Dict, List, Optional, Union
 
 from google.auth.exceptions import MalformedError
 from google.cloud import container_v1
@@ -39,61 +38,88 @@
 from . import models
 from . import scanner
 from .client.client_factory import ClientFactory
-from .crawler.crawler_factory import CrawlerFactory
 from .crawler import misc_crawler
+from .crawler.crawler_factory import CrawlerFactory
 
 # We define the schema statically to make it easier for the user and avoid extra
 # config files.
LIGHT_VERSION_SCAN_SCHEMA = { - 'compute_instances': ['name', 'zone', 'machineType', 'networkInterfaces', - 'status'], - 'compute_images': ['name', 'status', 'diskSizeGb', 'sourceDisk'], - 'machine_images': ['name', 'description', 'status', 'sourceInstance', - 'totalStorageBytes', 'savedDisks'], - 'compute_disks': ['name', 'sizeGb', 'zone', 'status', 'sourceImage', 'users'], - 'compute_snapshots': ['name', 'status', 'sourceDisk', 'downloadBytes'], - 'managed_zones': ['name', 'dnsName', 'description', 'nameServers'], - 'sql_instances': ['name', 'region', 'ipAddresses', 'databaseVersion', - 'state'], - 'cloud_functions': ['name', 'eventTrigger', 'status', 'entryPoint', - 'serviceAccountEmail'], - 'kms': ['name', 'primary', 'purpose', 'createTime'], - 'services': ['name'], + 'compute_instances': [ + 'name', + 'zone', + 'machineType', + 'networkInterfaces', + 'status', + ], + 'compute_images': ['name', 'status', 'diskSizeGb', 'sourceDisk'], + 'machine_images': [ + 'name', + 'description', + 'status', + 'sourceInstance', + 'totalStorageBytes', + 'savedDisks', + ], + 'compute_disks': [ + 'name', + 'sizeGb', + 'zone', + 'status', + 'sourceImage', + 'users', + ], + 'compute_snapshots': ['name', 'status', 'sourceDisk', 'downloadBytes'], + 'managed_zones': ['name', 'dnsName', 'description', 'nameServers'], + 'sql_instances': [ + 'name', + 'region', + 'ipAddresses', + 'databaseVersion', + 'state', + ], + 'cloud_functions': [ + 'name', + 'eventTrigger', + 'status', + 'entryPoint', + 'serviceAccountEmail', + ], + 'kms': ['name', 'primary', 'purpose', 'createTime'], + 'services': ['name'], } # The following map is used to establish the relationship between # crawlers and clients. It determines the appropriate crawler and # client to be selected from the respective factory classes. 
CRAWL_CLIENT_MAP = { - 'app_services': 'appengine', - 'bigtable_instances': 'bigtableadmin', - 'bq': 'bigquery', - 'cloud_functions': 'cloudfunctions', - 'compute_disks': 'compute', - 'compute_images': 'compute', - 'compute_instances': 'compute', - 'compute_snapshots': 'compute', - 'datastore_kinds': 'datastore', - 'dns_policies': 'dns', - 'endpoints': 'servicemanagement', - 'firestore_collections': 'firestore', - 'filestore_instances': 'file', - 'firewall_rules': 'compute', - 'iam_policy': 'cloudresourcemanager', - 'kms': 'cloudkms', - 'machine_images': 'compute', - 'managed_zones': 'dns', - 'project_info': 'cloudresourcemanager', - 'pubsub_subs': 'pubsub', - 'registered_domains': 'domains', - 'services': 'serviceusage', - 'service_accounts': 'iam', - 'sourcerepos': 'sourcerepo', - 'spanner_instances': 'spanner', - 'sql_instances': 'sqladmin', - 'static_ips': 'compute', - 'storage_buckets': 'storage', - 'subnets': 'compute', + 'app_services': 'appengine', + 'bigtable_instances': 'bigtableadmin', + 'bq': 'bigquery', + 'cloud_functions': 'cloudfunctions', + 'compute_disks': 'compute', + 'compute_images': 'compute', + 'compute_instances': 'compute', + 'compute_snapshots': 'compute', + 'datastore_kinds': 'datastore', + 'dns_policies': 'dns', + 'endpoints': 'servicemanagement', + 'firestore_collections': 'firestore', + 'filestore_instances': 'file', + 'firewall_rules': 'compute', + 'iam_policy': 'cloudresourcemanager', + 'kms': 'cloudkms', + 'machine_images': 'compute', + 'managed_zones': 'dns', + 'pubsub_subs': 'pubsub', + 'registered_domains': 'domains', + 'services': 'serviceusage', + 'service_accounts': 'iam', + 'sourcerepos': 'sourcerepo', + 'spanner_instances': 'spanner', + 'sql_instances': 'sqladmin', + 'static_ips': 'compute', + 'storage_buckets': 'storage', + 'subnets': 'compute', } @@ -134,27 +160,45 @@ def save_results(res_data: Dict, res_path: str, is_light: bool): outfile.write(sa_results_data) -def get_crawl(crawler, project_id, client, crawler_config): - return crawler.crawl(project_id, client, crawler_config) +def get_crawl( + crawler, project_id, client, crawler_config, scan_results, crawler_name +): + res = crawler.crawl(project_id, client, crawler_config) + if res is not None and len(res) != 0: + scan_results[crawler_name] = res + return scan_results def get_resources(project: models.ProjectInfo): - """The function crawls the data for a project and stores the results in a + """The function crawls the data for a project and stores the results in a + dictionary. 
- Args: + Args: project: class to store project scan configration """ - if project.target_project and \ - project.target_project not in project.project['projectId']: + if ( + project.target_project + and project.target_project not in project.project['projectId'] + ): return project_id = project.project['projectId'] print(f'Inspecting project {project_id}') - project_result = project.sa_results['projects'][project_id] + project_result = dict() project_result['project_info'] = project.project + project_result['service_account_chain'] = project.sa_results[ + 'service_account_chain' + ] + project_result['current_service_account'] = project.sa_results[ + 'current_service_account' + ] + project_result['token_scopes'] = project.sa_results['token_scopes'] + project_result['service_account_edges'] = project.sa_results[project_id][ + 'service_account_edges' + ] # Fail with error if the output file already exists output_file_name = f'{project_id}-{project.scan_time_suffix}.json' @@ -166,71 +210,90 @@ def get_resources(project: models.ProjectInfo): pass except FileExistsError: - logging.error('Try removing the %s file and restart the scanner.', - output_file_name) - - results_crawl_pool = dict() - with concurrent.futures.ThreadPoolExecutor( - max_workers=int(project.worker_count)) as executor: - for crawler_name, client_name in CRAWL_CLIENT_MAP.items(): - if is_set(project.scan_config, crawler_name): - crawler_config = {} - if project.scan_config is not None: - crawler_config = project.scan_config.get(crawler_name) - # add gcs output path to the config. - # this path is used by the storage bucket crawler as of now. - crawler_config['gcs_output_path'] = gcs_output_path - # crawl the data - crawler = CrawlerFactory.create_crawler(crawler_name) - client = ClientFactory.get_client(client_name).get_service( + logging.error( + 'Try removing the %s file and restart the scanner.', output_file_name + ) + + threads_list = list() + for i, (crawler_name, client_name) in enumerate(CRAWL_CLIENT_MAP.items()): + if is_set(project.scan_config, crawler_name): + crawler_config = {} + if project.scan_config is not None: + crawler_config = project.scan_config.get(crawler_name) + + # add gcs output path to the config. + # this path is used by the storage bucket crawler as of now. 
+ crawler_config['gcs_output_path'] = gcs_output_path + + # crawl the data + crawler = CrawlerFactory.create_crawler(crawler_name) + client = ClientFactory.get_client(client_name).get_service( project.credentials, - ) - results_crawl_pool[crawler_name] = executor.submit( - get_crawl, - crawler, - project_id, - client, - crawler_config, - ) + ) - for crawler_name, future_obj in results_crawl_pool.items(): - project_result[crawler_name] = future_obj.result() + t = threading.Thread( + target=get_crawl, + args=( + crawler, + project_id, + client, + crawler_config, + project_result, + crawler_name, + ), + ) + t.setDaemon(True) + t.start() + threads_list.append(t) + + while True: + active_threads = 0 + for t in threads_list: + if t.is_alive(): + active_threads += 1 + if active_threads >= project.resource_worker_count: + time.sleep(0.1) + else: + break + + for t in threads_list: + t.join() # Call other miscellaneous crawlers here if is_set(project.scan_config, 'gke_clusters'): gke_client = gke_client_for_credentials(project.credentials) - project_result['gke_clusters'] = misc_crawler.get_gke_clusters( - project_id, - gke_client, + res = misc_crawler.get_gke_clusters( + project_id, + gke_client, ) + if res is not None and len(res) != 0: + project_result['gke_clusters'] = res if is_set(project.scan_config, 'gke_images'): - project_result['gke_images'] = misc_crawler.get_gke_images( - project_id, - project.credentials.token, + res = misc_crawler.get_gke_images( + project_id, + project.credentials.token, ) + if res is not None and len(res) != 0: + project_result['gke_clusters'] = res logging.info('Saving results for %s into the file', project_id) - - save_results(project.sa_results, output_path, project.light_scan) - # Clean memory to avoid leak for large amount projects. - project.sa_results.clear() + save_results(project_result, output_path, project.light_scan) def impersonate_service_accounts( - context, - project, - scan_config, - sa_results, - chain_so_far, - sa_name, - credentials + context, + project, + scan_config, + sa_results, + chain_so_far, + sa_name, + credentials, ): - """The function enumerates projects accessible by SA and impersonates them. 
- """ + """The function enumerates projects accessible by SA and impersonates them.""" # Enumerate projects accessible by SA project_id = project['projectId'] - print(f'Inspecting project {project_id} for Impersonation') + print(f'Looking for impersonation options in {project_id}') project_result = sa_results['projects'][project_id] project_result['project_info'] = project # Iterate over discovered service accounts by attempting impersonation @@ -247,10 +310,10 @@ def impersonate_service_accounts( iam_client = iam_client_for_credentials(credentials) if is_set(scan_config, 'iam_policy') is False: iam_policy = CrawlerFactory.create_crawler('iam_policy').crawl( - project_id, - ClientFactory.get_client('cloudresourcemanager').get_service( - credentials, - ), + project_id, + ClientFactory.get_client('cloudresourcemanager').get_service( + credentials, + ), ) project_service_accounts = get_sas_for_impersonation(iam_policy) @@ -258,29 +321,36 @@ def impersonate_service_accounts( try: logging.info('Trying %s', candidate_service_account) creds_impersonated = credsdb.impersonate_sa( - iam_client, candidate_service_account) + iam_client, candidate_service_account + ) context.service_account_queue.put( - (candidate_service_account, creds_impersonated, updated_chain)) + (candidate_service_account, creds_impersonated, updated_chain) + ) project_result['service_account_edges'].append( - candidate_service_account) - logging.info('Successfully impersonated %s using %s', - candidate_service_account, sa_name) + candidate_service_account + ) + logging.info( + 'Successfully impersonated %s using %s', + candidate_service_account, + sa_name, + ) except Exception: - logging.error('Failed to get token for %s', - candidate_service_account) + logging.error('Failed to get token for %s', candidate_service_account) logging.error(sys.exc_info()[1]) def iam_client_for_credentials( - credentials: Credentials) -> IAMCredentialsClient: + credentials: Credentials, +) -> IAMCredentialsClient: return iam_credentials.IAMCredentialsClient(credentials=credentials) def gke_client_for_credentials( - credentials: Credentials + credentials: Credentials, ) -> container_v1.services.cluster_manager.client.ClusterManagerClient: return container_v1.services.cluster_manager.ClusterManagerClient( - credentials=credentials) + credentials=credentials + ) def get_sa_details_from_key_files(key_path): @@ -309,8 +379,7 @@ def get_sa_details_from_key_files(key_path): return sa_details -def get_sas_for_impersonation( - iam_policy: List[Dict[str, Any]]) -> List[str]: +def get_sas_for_impersonation(iam_policy: List[Dict[str, Any]]) -> List[str]: """Extract a list of unique SAs from IAM policy associated with project. Args: @@ -344,7 +413,8 @@ def infinite_defaultdict(): def get_sa_tuples(args): - """The function extracts service account (SA) credentials from various + """The function extracts service account (SA) credentials from various + sources and returns a list of tuples. 
""" @@ -376,8 +446,9 @@ def get_sa_tuples(args): continue logging.info('Retrieving credentials for %s', account_name) - credentials = credsdb.get_creds_from_data(access_token, - json.loads(account_creds)) + credentials = credsdb.get_creds_from_data( + access_token, json.loads(account_creds) + ) if credentials is None: logging.error('Failed to retrieve access token for %s', account_name) continue @@ -406,19 +477,22 @@ def get_sa_tuples(args): return sa_tuples + def main(): - """The main scanner loop for GCP Scanner - """ + """The main scanner loop for GCP Scanner""" logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR) logging.getLogger('googleapiclient.http').setLevel(logging.ERROR) args = arguments.arg_parser() - logging.basicConfig(level=getattr(logging, args.log_level.upper(), None), - format='%(asctime)s - %(levelname)s - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - filename=args.log_file, filemode='a') + logging.basicConfig( + level=getattr(logging, args.log_level.upper(), None), + format='%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + filename=args.log_file, + filemode='a', + ) force_projects_list = list() if args.force_projects: @@ -436,7 +510,7 @@ def main(): context = models.SpiderContext(sa_tuples) - project_queue = multiprocessing.Queue() + project_queue = list() processed_sas = set() while not context.service_account_queue.empty(): @@ -457,9 +531,11 @@ def main(): sa_results['token_scopes'] = credentials.scopes project_list = CrawlerFactory.create_crawler( - 'project_list', + 'project_list', ).crawl( - ClientFactory.get_client('cloudresourcemanager').get_service(credentials), + ClientFactory.get_client('cloudresourcemanager').get_service( + credentials + ), ) if len(project_list) <= 0: @@ -468,53 +544,70 @@ def main(): if force_projects_list: for force_project_id in force_projects_list: res = CrawlerFactory.create_crawler( - 'project_info', + 'project_info', ).crawl( - force_project_id, - ClientFactory.get_client('cloudresourcemanager').get_service( - credentials, - ), + force_project_id, + ClientFactory.get_client('cloudresourcemanager').get_service( + credentials, + ), ) if res: project_list.append(res) else: # force object creation anyway - project_list.append({'projectId': force_project_id, - 'projectNumber': 'N/A'}) + project_list.append( + {'projectId': force_project_id, 'projectNumber': 'N/A'} + ) # Enumerate projects accessible by SA for project in project_list: project_obj = models.ProjectInfo( - project, - sa_results, - args.output, - scan_config, - args.light_scan, - args.target_project, - scan_time_suffix, - sa_name, - credentials, - chain_so_far, - args.worker_count + project, + sa_results, + args.output, + scan_config, + args.light_scan, + args.target_project, + scan_time_suffix, + sa_name, + credentials, + chain_so_far, + int(args.resource_worker_count), ) - project_queue.put(project_obj) + project_queue.append(project_obj) impersonate_service_accounts( - context, - project, - scan_config, - sa_results, - chain_so_far, - sa_name, - credentials + context, + project, + scan_config, + sa_results, + chain_so_far, + sa_name, + credentials, ) - pool = multiprocessing.Pool( - processes=min(int(args.worker_count), os.cpu_count())) + all_thread_handles = list() - while not project_queue.empty(): - pool.apply_async(scanner.get_resources, args=(project_queue.get(),)) + # See i#267 on why we use the native threading approach here. 
+ for i, project_obj in enumerate(project_queue): + print('Finished %d projects out of %d' % (i, len(project_queue) - 1)) + sync_t = threading.Thread(target=scanner.get_resources, args=(project_obj,)) + sync_t.setDaemon(True) + sync_t.start() + all_thread_handles.append(sync_t) + + while True: # enforce explicit block on number of threads + active_threads = 0 + for t in all_thread_handles: + if t.is_alive(): + active_threads += 1 + + if active_threads >= int(args.project_worker_count): + time.sleep(0.1) + else: + break - pool.close() - pool.join() + # wait for any threads left to finish + for t in all_thread_handles: + t.join() return 0 From c270da9d7ff4b314c3ee793b88d44a99157ed564 Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 02:41:49 +0000 Subject: [PATCH 02/12] making pylint happy --- src/gcp_scanner/scanner.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/gcp_scanner/scanner.py b/src/gcp_scanner/scanner.py index 5c3c1a2a..26c05efb 100644 --- a/src/gcp_scanner/scanner.py +++ b/src/gcp_scanner/scanner.py @@ -15,7 +15,6 @@ """The main module that initiates scanning of GCP resources.""" import collections -import concurrent from datetime import datetime import json from json.decoder import JSONDecodeError @@ -215,7 +214,7 @@ def get_resources(project: models.ProjectInfo): ) threads_list = list() - for i, (crawler_name, client_name) in enumerate(CRAWL_CLIENT_MAP.items()): + for crawler_name, client_name in CRAWL_CLIENT_MAP.items(): if is_set(project.scan_config, crawler_name): crawler_config = {} if project.scan_config is not None: @@ -242,7 +241,7 @@ def get_resources(project: models.ProjectInfo): crawler_name, ), ) - t.setDaemon(True) + t.daemon = True t.start() threads_list.append(t) @@ -591,7 +590,7 @@ def main(): for i, project_obj in enumerate(project_queue): print('Finished %d projects out of %d' % (i, len(project_queue) - 1)) sync_t = threading.Thread(target=scanner.get_resources, args=(project_obj,)) - sync_t.setDaemon(True) + sync_t.daemon = True sync_t.start() all_thread_handles.append(sync_t) From a6cfd3365ee9f419a947dd07301f73b705424d7a Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 02:47:42 +0000 Subject: [PATCH 03/12] fixing test acceptance --- src/gcp_scanner/test_acceptance.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gcp_scanner/test_acceptance.py b/src/gcp_scanner/test_acceptance.py index 867d90fe..a856facf 100644 --- a/src/gcp_scanner/test_acceptance.py +++ b/src/gcp_scanner/test_acceptance.py @@ -63,10 +63,8 @@ def check_obj_entry(res_dict, subojects_count, entry_name, volatile=False): def validate_result(): file_name = os.listdir("res/")[0] with open("res/" + file_name, "r", encoding="utf-8") as f: - res_data = json.load(f) + project = json.load(f) - # project - project = res_data["projects"].get("test-gcp-scanner-2", None) assert project is not None assert len(project) == RESOURCE_COUNT From d08bdf7c3c4b297812dd60a3e9a2a62687515a4e Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 02:52:09 +0000 Subject: [PATCH 04/12] we print results of scan in tests --- src/gcp_scanner/test_acceptance.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gcp_scanner/test_acceptance.py b/src/gcp_scanner/test_acceptance.py index a856facf..66cfb8f5 100644 --- a/src/gcp_scanner/test_acceptance.py +++ b/src/gcp_scanner/test_acceptance.py @@ -65,6 +65,7 @@ def validate_result(): with open("res/" + file_name, "r", encoding="utf-8") as f: project = json.load(f) + 
print(project) assert project is not None assert len(project) == RESOURCE_COUNT From d1f19e0333dacd0d0c73753f6ac390a517f8248a Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 02:57:14 +0000 Subject: [PATCH 05/12] printing results in JSON --- src/gcp_scanner/test_acceptance.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gcp_scanner/test_acceptance.py b/src/gcp_scanner/test_acceptance.py index 66cfb8f5..d9c83f80 100644 --- a/src/gcp_scanner/test_acceptance.py +++ b/src/gcp_scanner/test_acceptance.py @@ -18,6 +18,7 @@ import json import os +import sys import unittest.mock from . import scanner @@ -65,7 +66,7 @@ def validate_result(): with open("res/" + file_name, "r", encoding="utf-8") as f: project = json.load(f) - print(project) + json.dump(project, sys.stdout) assert project is not None assert len(project) == RESOURCE_COUNT From ed75ea5d85c4eff0906cc51630cfeb18690fe2ad Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 03:04:22 +0000 Subject: [PATCH 06/12] changing order of tests --- src/gcp_scanner/test_acceptance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gcp_scanner/test_acceptance.py b/src/gcp_scanner/test_acceptance.py index d9c83f80..8e8717c8 100644 --- a/src/gcp_scanner/test_acceptance.py +++ b/src/gcp_scanner/test_acceptance.py @@ -68,7 +68,6 @@ def validate_result(): json.dump(project, sys.stdout) assert project is not None - assert len(project) == RESOURCE_COUNT check_obj_entry(project, PROJECT_INFO_COUNT, "project_info") check_obj_entry(project, IAM_POLICY_COUNT, "iam_policy") @@ -106,6 +105,7 @@ def validate_result(): check_obj_entry(project, SERVICES_COUNT, "services") + assert len(project) == RESOURCE_COUNT def test_acceptance(): os.mkdir("res") From 8a0cb3b988a5815f288a43081a3b2503127eb02f Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 03:45:06 +0000 Subject: [PATCH 07/12] now we have GKE clusters test --- src/gcp_scanner/test_acceptance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gcp_scanner/test_acceptance.py b/src/gcp_scanner/test_acceptance.py index 8e8717c8..a831cf31 100644 --- a/src/gcp_scanner/test_acceptance.py +++ b/src/gcp_scanner/test_acceptance.py @@ -37,7 +37,7 @@ APP_SERVICES_COUNT = 2 STORAGE_BUCKETS_COUNT = 5 MANAGED_ZONES_COUNT = 2 -GKE_CLUSTERS_COUNT = 0 +GKE_CLUSTERS_COUNT = 4 GKE_IMAGES_COUNT = 4 SQL_INSTANCES_COUNT = 1 BQ_COUNT = 1 From 19231a81c6792b7d59f8ea2b379fa71654a44717 Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 03:50:33 +0000 Subject: [PATCH 08/12] fixing bug with gke_images --- src/gcp_scanner/scanner.py | 2 +- src/gcp_scanner/test_acceptance.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gcp_scanner/scanner.py b/src/gcp_scanner/scanner.py index 26c05efb..90d13c1a 100644 --- a/src/gcp_scanner/scanner.py +++ b/src/gcp_scanner/scanner.py @@ -273,7 +273,7 @@ def get_resources(project: models.ProjectInfo): project.credentials.token, ) if res is not None and len(res) != 0: - project_result['gke_clusters'] = res + project_result['gke_images'] = res logging.info('Saving results for %s into the file', project_id) save_results(project_result, output_path, project.light_scan) diff --git a/src/gcp_scanner/test_acceptance.py b/src/gcp_scanner/test_acceptance.py index a831cf31..8e8717c8 100644 --- a/src/gcp_scanner/test_acceptance.py +++ b/src/gcp_scanner/test_acceptance.py @@ -37,7 +37,7 @@ APP_SERVICES_COUNT = 2 STORAGE_BUCKETS_COUNT = 5 MANAGED_ZONES_COUNT 
= 2 -GKE_CLUSTERS_COUNT = 4 +GKE_CLUSTERS_COUNT = 0 GKE_IMAGES_COUNT = 4 SQL_INSTANCES_COUNT = 1 BQ_COUNT = 1 From 20d61793e808f7262960a725d4786de59f99fec6 Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 03:56:50 +0000 Subject: [PATCH 09/12] Support new format when no output is given --- src/gcp_scanner/test_acceptance.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/gcp_scanner/test_acceptance.py b/src/gcp_scanner/test_acceptance.py index 8e8717c8..3b00da5a 100644 --- a/src/gcp_scanner/test_acceptance.py +++ b/src/gcp_scanner/test_acceptance.py @@ -54,6 +54,10 @@ def check_obj_entry(res_dict, subojects_count, entry_name, volatile=False): obj = res_dict.get(entry_name, None) + if subojects_count == 0: + assert obj is None + return + if volatile is True: assert obj is not None and (len(obj) == subojects_count or \ len(obj) == subojects_count - 1) From 767eb43a7d4119d6421dfc9f01d17802c6944539 Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 04:00:28 +0000 Subject: [PATCH 10/12] reducing number of resources --- src/gcp_scanner/test_acceptance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gcp_scanner/test_acceptance.py b/src/gcp_scanner/test_acceptance.py index 3b00da5a..a2dae004 100644 --- a/src/gcp_scanner/test_acceptance.py +++ b/src/gcp_scanner/test_acceptance.py @@ -23,7 +23,7 @@ from . import scanner -RESOURCE_COUNT = 31 +RESOURCE_COUNT = 30 RESULTS_JSON_COUNT = 1 PROJECT_INFO_COUNT = 5 IAM_POLICY_COUNT = 12 From dca72f839725220d8e33c4b15b9b437833615c9b Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 04:05:19 +0000 Subject: [PATCH 11/12] Update README.md --- README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 40e1ea8c..270b3838 100644 --- a/README.md +++ b/README.md @@ -65,20 +65,20 @@ There is a docker build file if you want to run the scanner from a container: ### Command-line options ``` -usage: gcp-scanner -o /folder_to_save_results/ -g - +usage: python3 scanner.py -o folder_to_save_results -g - GCP Scanner options: -h, --help show this help message and exit + -ls, --light-scan Return only the most important GCP resource fields in the output. -k KEY_PATH, --sa-key-path KEY_PATH Path to directory with SA keys in json format -g GCLOUD_PROFILE_PATH, --gcloud-profile-path GCLOUD_PROFILE_PATH Path to directory with gcloud profile. Specify - to search for credentials in default gcloud config path -m, --use-metadata Extract credentials from GCE instance metadata -at ACCESS_TOKEN_FILES, --access-token-files ACCESS_TOKEN_FILES - A list of comma separated files with access token and OAuth scopes.TTL limited. A token and scopes should be stored in JSON - format. + A list of comma separated files with access token and OAuth scopes.TTL limited. A token and scopes should be stored in JSON format. -rt REFRESH_TOKEN_FILES, --refresh-token-files REFRESH_TOKEN_FILES A list of comma separated files with refresh_token, client_id,token_uri and client_secret stored in JSON format. -s KEY_NAME, --service-account KEY_NAME @@ -89,10 +89,14 @@ options: Comma separated list of project names to include in the scan -c CONFIG_PATH, --config CONFIG_PATH A path to config file with a set of specific resources to scan. 
- -l {INFO,WARNING,ERROR}, --logging {INFO,WARNING,ERROR} + -l {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --logging {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set logging level (INFO, WARNING, ERROR) - -lf LOG_DIRECTORY, --log-file LOG_DIRECTORY + -lf LOG_FILE, --log-file LOG_FILE Save logs to the path specified rather than displaying in console + -pwc PROJECT_WORKER_COUNT, --project-worker-count PROJECT_WORKER_COUNT + Set limit for project crawlers run in parallel. + -rwc RESOURCE_WORKER_COUNT, --resource-worker-count RESOURCE_WORKER_COUNT + Set limit for resource crawlers run in parallel. Required parameters: -o OUTPUT, --output-dir OUTPUT From 37af3367af7066ea4b10aba4a2c9278ba2c31f80 Mon Sep 17 00:00:00 2001 From: Maksim Shudrak Date: Sat, 16 Sep 2023 04:15:43 +0000 Subject: [PATCH 12/12] get_crawl function description --- src/gcp_scanner/scanner.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/gcp_scanner/scanner.py b/src/gcp_scanner/scanner.py index 90d13c1a..f2280f05 100644 --- a/src/gcp_scanner/scanner.py +++ b/src/gcp_scanner/scanner.py @@ -160,8 +160,27 @@ def save_results(res_data: Dict, res_path: str, is_light: bool): def get_crawl( - crawler, project_id, client, crawler_config, scan_results, crawler_name + crawler: Any, + project_id: str, + client: Any, + crawler_config: dict, + scan_results: dict, + crawler_name: str, ): + """The function calls the crawler and returns result in dictionary + + Args: + crawler: crawler method to start + project_id: id of a project to scan + client: appropriate client method + crawler_config: a dictionary containing specific parameters for a crawler + scan_results: a dictionary to save scanning results + crawler_name: name of a crawler + + Returns: + scan_result: a dictionary with scanning results + """ + res = crawler.crawl(project_id, client, crawler_config) if res is not None and len(res) != 0: scan_results[crawler_name] = res