GCS scanner fix, custom scopes support and more

1. Fixing an issue in the GCS file scanner that could result in an infinite loop. 2. Fixing an issue with retrieving scopes from GCP VM metadata. 3. Adding support for a list of files with access tokens. 4. Adding support for user-defined custom scopes for access tokens (AT) and refresh tokens (RT). 5. Project results are now saved in individual files. 6. The list of GCS files is now saved in a separate file rather than in the project result.
google · Dec 5, 2022 · 6e2ff4b · 6e2ff4b
1 parent 6c6614f
commit 6e2ff4b
Show file tree

Hide file tree

Showing 3 changed files with 108 additions and 49 deletions.
diff --git a/src/gcp_scanner/crawl.py b/src/gcp_scanner/crawl.py
@@ -18,7 +18,9 @@
 """
 
 import collections
+import json
 import logging
+import io
 import sys
 from typing import List, Dict, Any, Tuple
 
@@ -325,18 +327,19 @@ def get_firewall_rules(
 
 
 def get_bucket_names(project_name: str, credentials: Credentials,
-                     enum_files: bool) -> Dict[str, Tuple[Any, List[Any]]]:
+                     dump_fd: io.TextIOWrapper
+                     ) -> Dict[str, Tuple[Any, List[Any]]]:
   """Retrieve a list of buckets available in the project.
 
   Args:
     project_name: A name of a project to query info about.
     credentials: An google.oauth2.credentials.Credentials object.
-    enum_files: If true, the function will enumerate files stored in buckets.
+    dump_fd: If set, the function will enumerate files stored in buckets and
+      save them in a file corresponding to provided file descriptor.
       This is a very slow, noisy operation and should be used with caution.
 
   Returns:
-    A dictionary where key is bucket name and value is a tuple of
-    a bucket Object and list of file objects associated with bucket.
+    A dictionary where key is bucket name and value is a bucket Object.
   """
 
   logging.info("Retrieving GCS Buckets")
@@ -352,28 +355,28 @@ def get_bucket_names(project_name: str, credentials: Credentials,
       logging.info("Failed to list buckets in the %s", project_name)
       logging.info(sys.exc_info())
       break
+
     for bucket in response.get("items", []):
       buckets_dict[bucket["name"]] = (bucket, None)
-      if enum_files is True:
+      if dump_fd is not None:
         ret_fields = "nextPageToken,items(name,size,contentType,timeCreated)"
 
         req = service.objects().list(bucket=bucket["name"], fields=ret_fields)
 
-        all_objects = []
         while req:
           try:
             resp = req.execute()
-            all_objects.extend(resp.get("items", []))
+            for item in resp.get("items", []):
+              dump_fd.write(json.dumps(item, indent=2, sort_keys=False))
+
+            req = service.objects().list_next(req, resp)
           except googleapiclient.errors.HttpError:
             logging.info("Failed to read the bucket %s", bucket["name"])
             logging.info(sys.exc_info())
-            continue
-          req = service.objects().list_next(req, resp)
+            break
 
-        buckets_dict[bucket["name"]] = (bucket, all_objects)
-
-      request = service.buckets().list_next(
-          previous_request=request, previous_response=response)
+    request = service.buckets().list_next(
+        previous_request=request, previous_response=response)
 
   return buckets_dict
 

diff --git a/src/gcp_scanner/credsdb.py b/src/gcp_scanner/credsdb.py
@@ -32,10 +32,6 @@
 from httplib2 import Credentials
 import requests
 
-# Permissions to request for Access Token
-scopes = "https://www.googleapis.com/auth/cloud-platform"
-
-expires_in = 3600  # Expires in 1 hour
 
 credentials_db_search_places = ["/home/", "/root/"]
 
@@ -125,7 +121,7 @@ def get_creds_from_metadata() -> Tuple[Optional[str], Optional[Credentials]]:
 
   print("Successfully retrieved instance metadata")
   print("Access token length: %d" % len(token), "Instance email: %s" % email,
-        "Instance scopes: %s" % scopes)
+        "Instance scopes: %s" % instance_scopes)
   return email, credentials_from_token(token, None, None, None, None,
                                        instance_scopes)
 
@@ -313,24 +309,79 @@ def impersonate_sa(iam_client: IAMCredentialsClient,
                                 None, None, None, scopes_sa)
 
 
+def creds_from_access_token(access_token_file):
+  """The function is used to obtain Google Auth Credentials from access token.
+
+  Args:
+    access_token_file: a path to a file with access token and scopes stored in
+    JSON format. Example:
+      {
+        "access_token": "<token>",
+        "scopes": [
+          "https://www.googleapis.com/auth/devstorage.read_only",
+          "https://www.googleapis.com/auth/logging.write",
+          "https://www.googleapis.com/auth/monitoring.write",
+          "https://www.googleapis.com/auth/servicecontrol",
+          "https://www.googleapis.com/auth/service.management.readonly",
+          "https://www.googleapis.com/auth/trace.append"
+        ]
+      }
+
+  Returns:
+    google.auth.service_account.Credentials: The constructed credentials.
+  """
+
+  with open(access_token_file, encoding="utf-8") as f:
+    creds_dict = json.load(f)
+
+  user_scopes = creds_dict.get("scopes", None)
+  if user_scopes is None:
+    user_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
+
+  return credentials_from_token(
+            creds_dict["access_token"],
+            None,
+            None,
+            None,
+            None,
+            user_scopes)
+
+
 def creds_from_refresh_token(refresh_token_file):
   """The function is used to obtain Google Auth Credentials from refresh token.
 
   Args:
     refresh_token_file: a path to a file with refresh_token, client_id,
       client_secret, and token_uri stored in JSON format.
-
+    Example:
+      {
+        "refresh_token": "<token>",
+        "client_id": "id",
+        "client_secret": "secret",
+        "scopes": [
+          "https://www.googleapis.com/auth/devstorage.read_only",
+          "https://www.googleapis.com/auth/logging.write",
+          "https://www.googleapis.com/auth/monitoring.write",
+          "https://www.googleapis.com/auth/servicecontrol",
+          "https://www.googleapis.com/auth/service.management.readonly",
+          "https://www.googleapis.com/auth/trace.append"
+        ]
+      }
   Returns:
     google.auth.service_account.Credentials: The constructed credentials.
   """
 
   with open(refresh_token_file, encoding="utf-8") as f:
     creds_dict = json.load(f)
 
+  user_scopes = creds_dict.get("scopes", None)
+  if user_scopes is None:
+    user_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
+
   return credentials.Credentials(
       None,
       refresh_token=creds_dict["refresh_token"],
       token_uri=creds_dict["token_uri"],
       client_id=creds_dict["client_id"],
       client_secret=creds_dict["client_secret"],
-      scopes=["https://www.googleapis.com/auth/cloud-platform"])
+      scopes=user_scopes)
diff --git a/src/gcp_scanner/scanner.py b/src/gcp_scanner/scanner.py
@@ -82,8 +82,9 @@ def crawl_loop(initial_sa_tuples: List[Tuple[str, Credentials, List[str]]],
         if res:
           project_list.append(res)
         else:
-          # force object creation
-          project_list.append({'projectId': force_project_id, 'projectNumber': 'N/A'})
+          # force object creation anyway
+          project_list.append({'projectId': force_project_id,
+                               'projectNumber': "N/A"})
 
     # Enumerate projects accessible by SA
     for project in project_list:
@@ -147,16 +148,17 @@ def crawl_loop(initial_sa_tuples: List[Tuple[str, Credentials, List[str]]],
 
       # Get storage buckets
       if is_set(scan_config, 'storage_buckets'):
-        fetch_bucket_names = False
+        dump_file_names = None
         if scan_config is not None:
           obj = scan_config.get('storage_buckets', None)
-          if obj is not None:
-            fetch_bucket_names = obj.get('fetch_file_names', False)
+          if obj is not None and obj.get('fetch_file_names', False) is True:
+            dump_file_names = open(out_dir + '/%s.gcs' % project_id, 'w',
+                                   encoding='utf-8')
         project_result['storage_buckets'] = crawl.get_bucket_names(project_id,
-                                                credentials, fetch_bucket_names)
+                                                credentials, dump_file_names)
 
       # Get DNS managed zones
-      if is_set(scan_config, 'storage_buckets'):
+      if is_set(scan_config, 'managed_zones'):
         project_result['managed_zones'] = crawl.get_managed_zones(project_id,
                                                                   credentials)
 
@@ -230,7 +232,7 @@ def crawl_loop(initial_sa_tuples: List[Tuple[str, Credentials, List[str]]],
             iam_policy)
 
         for candidate_service_account in project_service_accounts:
-          print('Trying %s' % candidate_service_account)
+          logging.info('Trying %s' % candidate_service_account)
           if not candidate_service_account.startswith('serviceAccount'):
             continue
           try:
@@ -240,20 +242,24 @@ def crawl_loop(initial_sa_tuples: List[Tuple[str, Credentials, List[str]]],
                 (candidate_service_account, creds_impersonated, updated_chain))
             project_result['service_account_edges'].append(
                 candidate_service_account)
-            print('Successfully impersonated %s using %s ' %
+            logging.info('Successfully impersonated %s using %s ' %
                   (candidate_service_account, sa_name))
           except Exception:
             logging.info('Failed to get token for %s',
                                                       candidate_service_account)
             logging.info(sys.exc_info()[1])
 
-    # Write out results to json DB
-    logging.info('Saving results into the file')
+      # Write out results to json DB
+      logging.info('Saving results for {project_id} into the file')
+
+      sa_results_data = json.dumps(sa_results, indent=2, sort_keys=False)
 
-    sa_results_data = json.dumps(sa_results, indent=2, sort_keys=False)
+      with open(out_dir + '/%s.json' % sa_name, 'w',
+                encoding='utf-8') as outfile:
+        outfile.write(sa_results_data)
 
-    with open(out_dir + '/%s.json' % sa_name, 'w', encoding='utf-8') as outfile:
-      outfile.write(sa_results_data)
+      # Clean memory to avoid leak for large amount projects.
+      sa_results.clear()
 
 
 def iam_client_for_credentials(
@@ -313,14 +319,15 @@ def main():
   parser.add_argument(
       '-at',
       default=None,
-      dest='access_token',
-      help='Use access token directly to scan GCP resources. Limited by TTL')
+      dest='access_token_files',
+      help='A list of comma separated files with access token and OAuth scopes.\
+TTL limited. A token and scopes should be stored in JSON format.')
   parser.add_argument(
       '-rt',
       default=None,
       dest='refresh_token_files',
       help='A list of comma separated files with refresh_token, client_id,\
-token_uri and client_secret'
+token_uri and client_secret stored in JSON format.'
   )
   parser.add_argument(
       '-s', default=None, dest='key_name', help='Name of individual SA to scan')
@@ -406,18 +413,16 @@ def main():
           continue
 
         sa_tuples.append((account_name, credentials, []))
-  if args.access_token:
-    credentials = credsdb.credentials_from_token(
-        args.access_token,
-        None,
-        None,
-        None,
-        None,
-        scopes_user='https://www.googleapis.com/auth/cloud-platform')
-    if credentials is None:
-      logging.info('Failed to retrieve credentials using token provided')
-    else:
-      sa_tuples.append(('access_token_user_provided', credentials, []))
+
+  if args.access_token_files:
+    for access_token_file in args.access_token_files.split(','):
+      credentials = credsdb.creds_from_access_token(access_token_file)
+
+      if credentials is None:
+        logging.info('Failed to retrieve credentials using token provided')
+      else:
+        token_file_name = os.path.basename(refresh_token_file)
+        sa_tuples.append(('token_file_name', credentials, []))
 
   if args.refresh_token_files:
     for refresh_token_file in args.refresh_token_files.split(','):