In [1]:
import pandas as pd

# Adjusted to use local workspace CSV files
programs = pd.read_csv("programs.csv")
repos = pd.read_csv("repositories.csv")


In [2]:
iac_solutions = ["Terraform", "CDKTF", "Pulumi", "CloudFormation", "Bicep", "Ansible"]
subset = programs[programs["solution"].isin(iac_solutions)]

iac_ext = programs["directory"].str.contains(r"\.tf|cdktf\.json|Pulumi|\.bicep|\.yaml|\.yml", case=False, na=False)
subset = pd.concat([subset, programs[iac_ext]]).drop_duplicates()

merged = subset.merge(repos, left_on="repository", right_on="ID", how="left")
print("Columns:", merged.columns.tolist())
print("Shape:", merged.shape)


Columns: ['ID_x', 'repository', 'directory', 'solution', 'language', 'name_x', 'description_x', 'runtime', 'testing', 'tests', 'ID_y', 'url', 'downloaded', 'name_y', 'description_y', 'licenses', 'redistributable', 'created', 'updated', 'pushed', 'fork', 'forks', 'archive', 'programs']
Shape: (13777, 24)


In [3]:
print(merged.columns.tolist())


['ID_x', 'repository', 'directory', 'solution', 'language', 'name_x', 'description_x', 'runtime', 'testing', 'tests', 'ID_y', 'url', 'downloaded', 'name_y', 'description_y', 'licenses', 'redistributable', 'created', 'updated', 'pushed', 'fork', 'forks', 'archive', 'programs']


In [4]:
cleaned = merged[[
    "ID_y",        # repo ID
    "name_y",      # repo name
    "url",         # repo URL
    "directory",   # IaC folder inside repo
    "solution",
    "language"
]]

print("Cleaned shape:", cleaned.shape)
cleaned.head()


Cleaned shape: (13777, 6)


Unnamed: 0,ID_y,name_y,url,directory,solution,language
0,386451159,phillipedwards/pulumi-ts-import-error,https://api.github.com/repos/phillipedwards/pu...,386451159,Pulumi,typescript
1,214975348,jforge/iac-samples,https://api.github.com/repos/jforge/iac-samples,214975348/pulumi/quickstart,Pulumi,typescript
2,457090491,eric-sherrill/ahoy-pulumi,https://api.github.com/repos/eric-sherrill/aho...,457090491,Pulumi,typescript
3,453026331,RawkodeAcademy/platform,https://api.github.com/repos/RawkodeAcademy/pl...,453026331/src/index.ts,Pulumi,typescript
4,497668388,iac-factory/gitlab-runner-cdktf,https://api.github.com/repos/iac-factory/gitla...,497668388/cdktf,CDKTF,typescript


In [5]:
import os

# Create download folder
os.makedirs("terraform_repos", exist_ok=True)

# Convert API URL → clone URL
def to_clone_url(api_url):
    return api_url.replace("api.github.com/repos", "github.com") + ".git"

# Pick first 20 repos for testing (you can increase later)
subset = cleaned.head(20)

for idx, row in subset.iterrows():
    clone_url = to_clone_url(row["url"])
    repo_name = row["name_y"].replace("/", "_")  # avoid folder conflicts
    print(f"Cloning {clone_url} into terraform_repos/{repo_name}...")
    os.system(f"git clone --depth 1 {clone_url} terraform_repos/{repo_name}")


Cloning https://github.com/phillipedwards/pulumi-ts-import-error.git into terraform_repos/phillipedwards_pulumi-ts-import-error...
Cloning https://github.com/jforge/iac-samples.git into terraform_repos/jforge_iac-samples...
Cloning https://github.com/eric-sherrill/ahoy-pulumi.git into terraform_repos/eric-sherrill_ahoy-pulumi...


Cloning https://github.com/RawkodeAcademy/platform.git into terraform_repos/RawkodeAcademy_platform...


Cloning https://github.com/iac-factory/gitlab-runner-cdktf.git into terraform_repos/iac-factory_gitlab-runner-cdktf...
Cloning https://github.com/dimitor115/pulumi-instance-refresh-bug-reproduction.git into terraform_repos/dimitor115_pulumi-instance-refresh-bug-reproduction...
Cloning https://github.com/phillipedwards/aws-ts-apigw-lambda.git into terraform_repos/phillipedwards_aws-ts-apigw-lambda...
Cloning https://github.com/NewOrbit/ClientSideEncryption.Demo.git into terraform_repos/NewOrbit_ClientSideEncryption.Demo...
Cloning https://github.com/ahmadalibagheri/cdktf-typescript-aws-iam.git into terraform_repos/ahmadalibagheri_cdktf-typescript-aws-iam...


Cloning https://github.com/andrewlaskey/pulumi-example.git into terraform_repos/andrewlaskey_pulumi-example...
Cloning https://github.com/andrewlaskey/pulumi-example.git into terraform_repos/andrewlaskey_pulumi-example...
Cloning https://github.com/WaffleHacks/infrastructure.git into terraform_repos/WaffleHacks_infrastructure...
Cloning https://github.com/Pocket/firefox-android-home-recommendations.git into terraform_repos/Pocket_firefox-android-home-recommendations...


Cloning https://github.com/nojaf/ronnies.be.git into terraform_repos/nojaf_ronnies.be...
Cloning https://github.com/wjrm500/heroku-clone.git into terraform_repos/wjrm500_heroku-clone...


Cloning https://github.com/rohankumardubey/pulumi.git into terraform_repos/rohankumardubey_pulumi...
Cloning https://github.com/rohankumardubey/pulumi.git into terraform_repos/rohankumardubey_pulumi...
Cloning https://github.com/rohankumardubey/pulumi.git into terraform_repos/rohankumardubey_pulumi...
Cloning https://github.com/rohankumardubey/pulumi.git into terraform_repos/rohankumardubey_pulumi...


Cloning https://github.com/rohankumardubey/pulumi.git into terraform_repos/rohankumardubey_pulumi...


In [6]:
# If python-hcl2 is not installed, run: pip install python-hcl2


In [7]:
import os
import hcl2
import json

def find_iac_files(base_dir):
    iac_files = []
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if file.endswith(".tf") or file.endswith(".json"):
                iac_files.append(os.path.join(root, file))
    return iac_files

def parse_tf_file(file_path):
    try:
        with open(file_path, "r") as f:
            return hcl2.load(f)  # works for .tf
    except Exception:
        try:
            with open(file_path, "r") as f:
                return json.load(f)  # fallback if JSON
        except:
            return None

# Example: parse first few IaC files from cloned repos
sample_dir = "terraform_repos"
files = find_iac_files(sample_dir)
print("Found IaC files:", files[:10])

if files:
    parsed = parse_tf_file(files[0])
    print("Parsed content (sample):", parsed)


Found IaC files: ['terraform_repos\\ahmadalibagheri_cdktf-typescript-aws-iam\\cdktf.json', 'terraform_repos\\ahmadalibagheri_cdktf-typescript-aws-iam\\package-lock.json', 'terraform_repos\\ahmadalibagheri_cdktf-typescript-aws-iam\\package.json', 'terraform_repos\\ahmadalibagheri_cdktf-typescript-aws-iam\\tsconfig.json', 'terraform_repos\\andrewlaskey_pulumi-example\\package-lock.json', 'terraform_repos\\andrewlaskey_pulumi-example\\package.json', 'terraform_repos\\andrewlaskey_pulumi-example\\tsconfig.json', 'terraform_repos\\andrewlaskey_pulumi-example\\child-infra\\package-lock.json', 'terraform_repos\\andrewlaskey_pulumi-example\\child-infra\\package.json', 'terraform_repos\\andrewlaskey_pulumi-example\\child-infra\\tsconfig.json']
Parsed content (sample): {'language': 'typescript', 'app': 'npx ts-node main.ts', 'projectId': 'e100a3f7-9187-4752-b3b5-9f5ef257eccd', 'terraformProviders': ['aws@~> 3.29.0'], 'terraformModules': [], 'context': {'excludeStackIdFromLogicalIds': 'true', 'al

In [8]:
import os
import hcl2
import json

def find_iac_files(base_dir):
    iac_files = []
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if file.endswith(".tf") or file == "cdktf.json" or "Pulumi" in file:
                iac_files.append(os.path.join(root, file))
    return iac_files

def parse_iac_file(file_path):
    try:
        if file_path.endswith(".tf"):
            with open(file_path, "r") as f:
                return hcl2.load(f)
        elif file_path.endswith(".json") or file_path.endswith(".yaml") or file_path.endswith(".yml"):
            with open(file_path, "r") as f:
                return json.load(f)
    except Exception as e:
        return {"error": str(e), "file": file_path}
    return None

# Example: parse IaC files
sample_dir = "terraform_repos"
files = find_iac_files(sample_dir)
print("IaC files found:", files[:10])

parsed_outputs = []
for f in files[:5]:   # try first 5
    parsed_outputs.append((f, parse_iac_file(f)))

for f, content in parsed_outputs:
    print("\nFile:", f)
    print("Parsed content (trimmed):", str(content)[:500])


IaC files found: ['terraform_repos\\ahmadalibagheri_cdktf-typescript-aws-iam\\cdktf.json', 'terraform_repos\\andrewlaskey_pulumi-example\\Pulumi.parent-stack.yaml', 'terraform_repos\\andrewlaskey_pulumi-example\\Pulumi.yaml', 'terraform_repos\\andrewlaskey_pulumi-example\\child-infra\\Pulumi.child-stack.yaml', 'terraform_repos\\andrewlaskey_pulumi-example\\child-infra\\Pulumi.yaml', 'terraform_repos\\dimitor115_pulumi-instance-refresh-bug-reproduction\\Pulumi.dev.yaml', 'terraform_repos\\dimitor115_pulumi-instance-refresh-bug-reproduction\\Pulumi.yaml', 'terraform_repos\\eric-sherrill_ahoy-pulumi\\Pulumi.yaml', 'terraform_repos\\iac-factory_gitlab-runner-cdktf\\cdktf\\cdktf.json', 'terraform_repos\\iac-factory_gitlab-runner-cdktf\\cdktf\\data.tf']

File: terraform_repos\ahmadalibagheri_cdktf-typescript-aws-iam\cdktf.json
Parsed content (trimmed): {'language': 'typescript', 'app': 'npx ts-node main.ts', 'projectId': 'e100a3f7-9187-4752-b3b5-9f5ef257eccd', 'terraformProviders': ['aws@~> 

In [9]:
def extract_features(parsed):
    features = {}
    if not parsed:
        return features

    # Providers
    providers = parsed.get("terraformProviders", [])
    features["num_providers"] = len(providers)
    features["providers"] = providers

    # Modules
    modules = parsed.get("terraformModules", [])
    features["num_modules"] = len(modules)
    features["modules"] = modules

    # Language
    features["language"] = parsed.get("language", "unknown")

    return features

# Run on first 5 parsed configs
for f, content in parsed_outputs:
    if content:
        feats = extract_features(content)
        print("\nFile:", f)
        print("Features:", feats)



File: terraform_repos\ahmadalibagheri_cdktf-typescript-aws-iam\cdktf.json
Features: {'num_providers': 1, 'providers': ['aws@~> 3.29.0'], 'num_modules': 0, 'modules': [], 'language': 'typescript'}

File: terraform_repos\andrewlaskey_pulumi-example\Pulumi.parent-stack.yaml
Features: {'num_providers': 0, 'providers': [], 'num_modules': 0, 'modules': [], 'language': 'unknown'}

File: terraform_repos\andrewlaskey_pulumi-example\Pulumi.yaml
Features: {'num_providers': 0, 'providers': [], 'num_modules': 0, 'modules': [], 'language': 'unknown'}

File: terraform_repos\andrewlaskey_pulumi-example\child-infra\Pulumi.child-stack.yaml
Features: {'num_providers': 0, 'providers': [], 'num_modules': 0, 'modules': [], 'language': 'unknown'}

File: terraform_repos\andrewlaskey_pulumi-example\child-infra\Pulumi.yaml
Features: {'num_providers': 0, 'providers': [], 'num_modules': 0, 'modules': [], 'language': 'unknown'}


In [10]:
import pandas as pd

# Load metadata (local CSVs)
programs = pd.read_csv("programs.csv")
repos = pd.read_csv("repositories.csv")

# 1. Select by known IaC solutions
iac_solutions = ["Terraform", "CDKTF", "Pulumi", "CloudFormation", "Bicep", "Ansible"]
subset = programs[programs["solution"].isin(iac_solutions)]

# 2. Select by file extensions (for safety)
iac_ext = programs["directory"].str.contains(r"\.tf|cdktf\.json|Pulumi|\.bicep|\.yaml|\.yml", case=False, na=False)

# 3. Merge both and deduplicate
subset = pd.concat([subset, programs[iac_ext]]).drop_duplicates()

# 4. Merge with repos metadata
merged = subset.merge(repos, left_on="repository", right_on="ID", how="left")

# 5. Get unique repos
unique_repos = merged.drop_duplicates(subset=["repository"])
print("Unique IaC repos:", unique_repos.shape[0])

# 6. If repos < 5000, just take all
if unique_repos.shape[0] >= 5000:
    sampled_repos = unique_repos.sample(n=5000, random_state=42)
else:
    sampled_repos = unique_repos  # take all available
    print(f"Only {unique_repos.shape[0]} repos available, taking all.")

# Save list
sampled_repos[["repository", "name_y", "url", "directory"]].to_csv("iac_sampled.csv", index=False)
print("Saved sampled repo list to iac_sampled.csv")


Unique IaC repos: 3885
Only 3885 repos available, taking all.
Saved sampled repo list to iac_sampled.csv


In [11]:
print(unique_repos["solution"].value_counts())


solution
Pulumi    3366
CDKTF      519
Name: count, dtype: int64


In [12]:
import os, subprocess, pandas as pd

# Load repo list
repos = pd.read_csv("iac_sampled.csv")

# Create folder for clones
os.makedirs("iac_repos", exist_ok=True)

# Clone first 50 repos as a test (safe for Review-2)
for i, row in repos.head(50).iterrows():
    repo_url = row["url"].replace("api.github.com/repos", "github.com")  # fix API → git URL
    repo_name = row["name_y"].replace("/", "_")
    target_dir = f"iac_repos/{repo_name}"

    if not os.path.exists(target_dir):
        try:
            print(f"Cloning {repo_url}...")
            subprocess.run(["git", "clone", repo_url, target_dir], check=True)
        except Exception as e:
            print(f"Failed {repo_url}: {e}")


Cloning https://github.com/RawkodeAcademy/platform...


Failed https://github.com/RawkodeAcademy/platform: Command '['git', 'clone', 'https://github.com/RawkodeAcademy/platform', 'iac_repos/RawkodeAcademy_platform']' returned non-zero exit status 128.
Cloning https://github.com/JIO93/Programming-With-Python-For-DevOps-Engineers-Bootcamp...


Failed https://github.com/JIO93/Programming-With-Python-For-DevOps-Engineers-Bootcamp: Command '['git', 'clone', 'https://github.com/JIO93/Programming-With-Python-For-DevOps-Engineers-Bootcamp', 'iac_repos/JIO93_Programming-With-Python-For-DevOps-Engineers-Bootcamp']' returned non-zero exit status 128.
Cloning https://github.com/nyamada43/demoPulumi...


Failed https://github.com/nyamada43/demoPulumi: Command '['git', 'clone', 'https://github.com/nyamada43/demoPulumi', 'iac_repos/nyamada43_demoPulumi']' returned non-zero exit status 128.


In [13]:
import os, subprocess, pandas as pd, time

# Load repo list (already filtered IaC repos)
repos = pd.read_csv("iac_sampled.csv")

# Create folder to store repos
os.makedirs("iac_repos", exist_ok=True)

# Define the clone function again
def clone_repo(url, name):
    repo_url = url.replace("api.github.com/repos", "github.com")
    target_dir = f"iac_repos/{name.replace('/', '_')}"
    if os.path.exists(target_dir):
        print(f"Already exists: {name}")
        return

    try:
        print(f"Cloning {repo_url}...")
        subprocess.run(
            ["git", "clone", "--depth", "1", repo_url, target_dir],
            check=True
        )
    except Exception as e:
        print(f"Failed {repo_url}: {e}")

# Batch clone only 1000 repos (safe for Review 2)
for i, row in repos.iloc[:1000].iterrows():
    clone_repo(row["url"], row["name_y"])

    if i % 200 == 0 and i > 0:
        print("Pausing to avoid GitHub block...")
        time.sleep(60)  # 1 min pause every 200 repos


Already exists: phillipedwards/pulumi-ts-import-error
Already exists: jforge/iac-samples
Already exists: eric-sherrill/ahoy-pulumi
Cloning https://github.com/RawkodeAcademy/platform...


Failed https://github.com/RawkodeAcademy/platform: Command '['git', 'clone', '--depth', '1', 'https://github.com/RawkodeAcademy/platform', 'iac_repos/RawkodeAcademy_platform']' returned non-zero exit status 128.
Already exists: iac-factory/gitlab-runner-cdktf
Already exists: dimitor115/pulumi-instance-refresh-bug-reproduction
Already exists: phillipedwards/aws-ts-apigw-lambda
Already exists: NewOrbit/ClientSideEncryption.Demo
Already exists: ahmadalibagheri/cdktf-typescript-aws-iam
Already exists: andrewlaskey/pulumi-example
Already exists: WaffleHacks/infrastructure
Already exists: Pocket/firefox-android-home-recommendations
Already exists: nojaf/ronnies.be
Already exists: wjrm500/heroku-clone
Already exists: rohankumardubey/pulumi
Already exists: alek29c/tf
Already exists: AdminTurnedDevOps/Live-Sessions-And-Conferences
Already exists: andrekiba/FantasticBike
Already exists: PHONGLEX/fastapi_pulumi_practice
Already exists: cyrano5614/pulumi-gcp-network
Already exists: hossambarakat/p

Failed https://github.com/JIO93/Programming-With-Python-For-DevOps-Engineers-Bootcamp: Command '['git', 'clone', '--depth', '1', 'https://github.com/JIO93/Programming-With-Python-For-DevOps-Engineers-Bootcamp', 'iac_repos/JIO93_Programming-With-Python-For-DevOps-Engineers-Bootcamp']' returned non-zero exit status 128.
Already exists: matrixbegins/uni-demo-deploy
Already exists: Shyam-Chen/Micro-Fullstack
Already exists: jaxxstorm/pulumi-automationapi-workshop
Already exists: cloudspeak/pulumi-lambda-efs
Already exists: aeons/aeons.dk
Already exists: mitodl/ol-infrastructure
Already exists: just-run-my/app-azure-edition
Cloning https://github.com/nyamada43/demoPulumi...


Failed https://github.com/nyamada43/demoPulumi: Command '['git', 'clone', '--depth', '1', 'https://github.com/nyamada43/demoPulumi', 'iac_repos/nyamada43_demoPulumi']' returned non-zero exit status 128.
Already exists: skorfmann/cdktf-provider-aws
Already exists: pulumi/pulumi-azure-quickstart-compute
Already exists: joeduffy/pulumi-awsvpc
Already exists: pulumi/pulumi-mailgun
Already exists: melihyazgan/IaC_pipeline
Already exists: EwanNoble/AppCatalog
Already exists: ikovac/pulumi-workshop-1
Already exists: nmora53/async-power-dialer-stone-poc
Already exists: artcoded-net/vendure-pulumi-gcp
Already exists: hmix/try-terraform-cdk-python
Already exists: jessicaUP/CloudVisitCount
Cloning https://github.com/strongishllama/cdktf-constructs...


Failed https://github.com/strongishllama/cdktf-constructs: Command '['git', 'clone', '--depth', '1', 'https://github.com/strongishllama/cdktf-constructs', 'iac_repos/strongishllama_cdktf-constructs']' returned non-zero exit status 128.
Already exists: pulumi/pulumi-mysql
Already exists: brightsole/solosis
Already exists: L0ndra/pulumi-do-ddos
Already exists: neilkuan/pulumi-cdk-example
Cloning https://github.com/somandr/ISaC...


Failed https://github.com/somandr/ISaC: Command '['git', 'clone', '--depth', '1', 'https://github.com/somandr/ISaC', 'iac_repos/somandr_ISaC']' returned non-zero exit status 128.
Already exists: einfachnuralex/gdeploy
Already exists: abatilo/grpc-timeout-repro
Already exists: korosuke613/playground
Already exists: BelRarr/confoo2022-iac4devs
Already exists: devopsjourney1/pulumi-helloworld
Already exists: lmayorga1980/gcp-cdktf-example
Already exists: DanielMSchmidt/cdktf-multi-stack-tfe
Already exists: alfredoem/pulumi-js-aws-example
Already exists: paiml/python_devops_book
Already exists: nicholas-yong/Personal-Website
Already exists: Tirke/try-pulumi
Already exists: blakegreendev/pulumi-aws-django-voting-app
Already exists: cloudreach/urly-wurly
Already exists: Lonero-Team/Decentralized-Internet
Already exists: SamEdwardes/rstudio-team-pulumi-recipes
Already exists: tyrchen/pulumi-examples
Already exists: gbaeke/pulumi-samples
Cloning https://github.com/poikaa/iac-research...


Failed https://github.com/poikaa/iac-research: Command '['git', 'clone', '--depth', '1', 'https://github.com/poikaa/iac-research', 'iac_repos/poikaa_iac-research']' returned non-zero exit status 128.
Already exists: amielkov/test-pulumi
Already exists: dahendel/pulumi-vsphere-k3s
Cloning https://github.com/Nolan01m/CI-Project-CCDC...


Failed https://github.com/Nolan01m/CI-Project-CCDC: Command '['git', 'clone', '--depth', '1', 'https://github.com/Nolan01m/CI-Project-CCDC', 'iac_repos/Nolan01m_CI-Project-CCDC']' returned non-zero exit status 128.
Already exists: amkul99/pulumi-fargate
Already exists: cmatskas/pulumi-aad-demo
Already exists: mizzy/pulumi-playground
Already exists: cfeenstra67/egghead
Already exists: yoshieki1992-wk/gke
Already exists: serverless-architecture/reference-architectures
Already exists: jaxxstorm/pulumi-productionapp
Already exists: nullify005/HomeAssistant
Already exists: jslopeza/nx-cdktf-sls-experiment
Already exists: AI2Incubator/AI2I_streamlitapp
Cloning https://github.com/alisavch/iac-image-service...


Failed https://github.com/alisavch/iac-image-service: Command '['git', 'clone', '--depth', '1', 'https://github.com/alisavch/iac-image-service', 'iac_repos/alisavch_iac-image-service']' returned non-zero exit status 128.
Already exists: wrasdf/pulumi-world
Already exists: wongcyrus/AzureCloudLabEnvironment
Already exists: GopinathYadavAR/pulumi-workshop
Cloning https://github.com/zhifanz/fanqiang-pulumi-python...


Failed https://github.com/zhifanz/fanqiang-pulumi-python: Command '['git', 'clone', '--depth', '1', 'https://github.com/zhifanz/fanqiang-pulumi-python', 'iac_repos/zhifanz_fanqiang-pulumi-python']' returned non-zero exit status 128.
Already exists: earafat-oak9/sample-terraform
Already exists: NorthStNetworking/NorthStNetworking
Already exists: pulumi/pulumi-policy-opa
Already exists: displague/pulumi-linode-webserver-ts
Already exists: mtrcn/georchestrator
Already exists: wallabyjs/wallaby-pulumi
Already exists: staslebedenko/perfect-infra-via-pulumi
Already exists: Silver-birder/social-trend
Already exists: pulumi-in-action/website
Already exists: goldoneen/nft-marketplace
Already exists: staeff/aws-infra
Already exists: clstokes/pulumi-aws-ts-ecs-existing-vpc
Cloning https://github.com/rmavuluri/terraform-docker...


Failed https://github.com/rmavuluri/terraform-docker: Command '['git', 'clone', '--depth', '1', 'https://github.com/rmavuluri/terraform-docker', 'iac_repos/rmavuluri_terraform-docker']' returned non-zero exit status 128.
Cloning https://github.com/swapnilmmane/ecommerce...


Failed https://github.com/swapnilmmane/ecommerce: Command '['git', 'clone', '--depth', '1', 'https://github.com/swapnilmmane/ecommerce', 'iac_repos/swapnilmmane_ecommerce']' returned non-zero exit status 128.
Already exists: WaryArchanid/Events
Already exists: petabridge/AkkaDotNet.LargeNetworkTests
Already exists: sturlath/MyTest
Already exists: gmiretti/pulumi-aws-py-lambda-api-gateway-ecache
Already exists: sdrush-mw/sr-cdktf-test
Cloning https://github.com/slimdevl/pulumi-ssm-example...


Failed https://github.com/slimdevl/pulumi-ssm-example: Command '['git', 'clone', '--depth', '1', 'https://github.com/slimdevl/pulumi-ssm-example', 'iac_repos/slimdevl_pulumi-ssm-example']' returned non-zero exit status 128.
Already exists: jtrinklein/trinkle.in-site
Already exists: codedevote/spartakiade2021-pulumi
Already exists: azul915/cron-serverless-pulumi
Already exists: Mauwii/azure-pipelines-with-github-repo
Already exists: pulumi/pulumi-aiven
Already exists: rjha0720/cdkterraform
Already exists: state-alchemists/zaruba
Already exists: hesbon-osoro/pulumi-gcp
Already exists: pulumi/pulumi-databricks
Already exists: shtakai/poc-pulumi-aws
Cloning https://github.com/devnullroots/k3d_playground...


Failed https://github.com/devnullroots/k3d_playground: Command '['git', 'clone', '--depth', '1', 'https://github.com/devnullroots/k3d_playground', 'iac_repos/devnullroots_k3d_playground']' returned non-zero exit status 128.
Already exists: ServiTrace/ReplicationPackage
Already exists: xai1983kbu/pulumi_appsync_with_direct_lambda_to_dynamodb
Already exists: redhwannacef/youtube-tutorials
Already exists: wanghao1891/sample
Cloning https://github.com/dhairya137/heroku-clone...


Failed https://github.com/dhairya137/heroku-clone: Command '['git', 'clone', '--depth', '1', 'https://github.com/dhairya137/heroku-clone', 'iac_repos/dhairya137_heroku-clone']' returned non-zero exit status 128.
Already exists: sankarpa/iac
Already exists: danieldspx/pulumi-ecs-example
Already exists: benesch/pulumi-bug-eks
Already exists: cpayret-KDM/KDM_API
Already exists: neilkuan/cdk-terraform-aws-ecs-fargate
Already exists: luizhlelis/auth0-IaC
Already exists: blakegreendev/pulumi-lambda-workshop
Already exists: c0bra/github-repo-freshness-api
Already exists: joeduffy/pk8syaml
Already exists: dirien/digitalocean-kubernetes-challenge
Already exists: alexhwoods/super-pulumi
Already exists: robertkotcher/robertkotcher.github.io
Already exists: Kannakattisanjana/IRA-Arrant
Already exists: sammck/apihub
Already exists: janfabian/offtopic-slack-bot
Already exists: sgrade/pulumi
Already exists: SzymonSmykala/Pulumi-micro-stacks-csharp
Already exists: squaremo/pulumi-ociregistry-provider


Failed https://github.com/NovatecConsulting/dot-translathor: Command '['git', 'clone', '--depth', '1', 'https://github.com/NovatecConsulting/dot-translathor', 'iac_repos/NovatecConsulting_dot-translathor']' returned non-zero exit status 128.
Already exists: rarous/rarousnet
Already exists: CharlieDigital/dn6-mongo-react-valtio
Already exists: wyx2014/yb.dev
Already exists: scaffold-sh/aws-serverless-docker
Already exists: levankhelo/pullantis
Already exists: henriklagergren/azure-trigger-benchmark
Already exists: Shatabdi2621/AzureResource---Pulumi
Already exists: andrekiba/CrazyBike
Already exists: ChristoMI/Web_Backend
Already exists: pulumiverse/pulumi-aws-eksa
Already exists: Nav-app/pulumiAwsS3LambdaDynamoDB
Already exists: L03TJ3/gd-gc-hc
Already exists: skill-collectors/agile-poker
Already exists: mysticrenji/pulumi-yaml
Already exists: pulumi/pulumi-snowflake
Already exists: goawsgo/aws-ekk-pulumi
Already exists: mskutin/lambda-nodejs-16
Already exists: romcheck/pulumi-stackgre

Failed https://github.com/sol-eng/pulumi-recipes: Command '['git', 'clone', '--depth', '1', 'https://github.com/sol-eng/pulumi-recipes', 'iac_repos/sol-eng_pulumi-recipes']' returned non-zero exit status 128.
Already exists: blakegreendev/vscode-fargate-pulumi
Already exists: pulumi/pulumi-dnsimple
Already exists: wgarcia79/pulumi-aws
Already exists: DavidLeong98/pulumi
Already exists: eesvara5512/terraform-samples
Already exists: DanielMSchmidt/terraform-cdk-terraform-module-publishing-on-gh-packages
Already exists: ahmadalibagheri/cdktf-typescript-aws-vpc
Already exists: lwooden/sample-node-api-service
Already exists: yanniz0r/sonq
Already exists: CerusBots/website
Already exists: ptavoni/forgeops
Already exists: cobraz/pulumi-wordpress
Already exists: vincetiu8/ee-project
Already exists: nukdcbear/Pulumi-Python-AWS-VPC-EC2
Already exists: aureq/pulumi-aws-ts-training-session-3
Already exists: AustinWise/GrpcMicroservicesOnGoogleCloudRun
Pausing to avoid GitHub block...


Already exists: fjfdepedro/aws-pulumi-ror
Already exists: shabados/viewer
Already exists: whs-dot-hk/cdktf-test
Already exists: AaronFriel/awsx-fargate-deps
Already exists: quebic-source/aws-data-migration
Already exists: phillipedwards/aws-plugin-issue
Already exists: perlin-network/perlinx-dashboard
Already exists: paulrobello/localstack-lambda-layer-bug
Already exists: pulumi/pulumi-yaml
Already exists: ramyaparthiban31/lender
Already exists: pulumi/pulumi-hcloud
Already exists: ReviakinAleksey/ls-example
Already exists: ahmadalibagheri/cdktf-typescript-aws-ec2
Already exists: spaceoddite/pulumi-azure-vm-component-resource-python
Already exists: Othello1111/Decentralized_Internet
Already exists: LachlanStevens/UntitledStack-3
Already exists: dave-burke/rpn-calc
Already exists: icgam/Pulumi.Samples
Already exists: shazi7804/cdktf-samples-python
Already exists: ksivamuthu/eks-pulumi-demo
Already exists: ntonjeta/pulumi_integration_examples
Already exists: pulumi/pulumi
Already exists:

Failed https://github.com/pahud/cdktf-aws-eks: Command '['git', 'clone', '--depth', '1', 'https://github.com/pahud/cdktf-aws-eks', 'iac_repos/pahud_cdktf-aws-eks']' returned non-zero exit status 128.
Already exists: jaxxstorm/brig.gs
Already exists: BITOCTA/diagrams-article-example
Already exists: tmeadon/clippings
Cloning https://github.com/ysmoradi/Bit.TodoTemplatePlayground...


Failed https://github.com/ysmoradi/Bit.TodoTemplatePlayground: Command '['git', 'clone', '--depth', '1', 'https://github.com/ysmoradi/Bit.TodoTemplatePlayground', 'iac_repos/ysmoradi_Bit.TodoTemplatePlayground']' returned non-zero exit status 128.
Already exists: sasidiropoulos/market
Cloning https://github.com/endymion1818/quantumleap-online...


Failed https://github.com/endymion1818/quantumleap-online: Command '['git', 'clone', '--depth', '1', 'https://github.com/endymion1818/quantumleap-online', 'iac_repos/endymion1818_quantumleap-online']' returned non-zero exit status 128.
Cloning https://github.com/ChanghoonHyun/cdktf-example...


Cloning https://github.com/harrismcc/monorepo-starter...


Cloning https://github.com/Welsh-Boogie/jkdkajsdnkkkpmplkqwe...


Cloning https://github.com/phanimullapudi/pulumi-learning...


Cloning https://github.com/noemiko/iac_examples...


Cloning https://github.com/kaiquelupo/preview-dialer...


Cloning https://github.com/dgg32/pulumi_quickstart...


Cloning https://github.com/bencematyasi/nginx-aws-demo...


Cloning https://github.com/simonzhow/hello-pulumi...


Cloning https://github.com/provenian/judge...


Cloning https://github.com/pulumi/qcon-workshop...


Cloning https://github.com/fuluteam/honeycomb...


Cloning https://github.com/cveld/pulumi-tester...


Cloning https://github.com/vaspoz/pgadmin-aws...


Cloning https://github.com/thakkaryash94/terraform-cdk-react-example...


Cloning https://github.com/jjgrinwis/pulumi-property...


Cloning https://github.com/davidNHK/project-bootstrap...


Cloning https://github.com/gravitee-io/gravitee-fargate...


Failed https://github.com/gravitee-io/gravitee-fargate: Command '['git', 'clone', '--depth', '1', 'https://github.com/gravitee-io/gravitee-fargate', 'iac_repos/gravitee-io_gravitee-fargate']' returned non-zero exit status 128.
Cloning https://github.com/jsdeveloperpro/nextjs-gcp-storage...


Cloning https://github.com/jaxxstorm/pulumi-secure-eks-workloads...


Cloning https://github.com/steinko/AIMTutorial...


Cloning https://github.com/ddunkin/pulumi-docker...


Cloning https://github.com/pulumi/pulumi-azure-quickstart-acr-geo-replication...


Cloning https://github.com/BoredTweak/Miscellaneous...


Cloning https://github.com/yyvess/kubpoc...


Cloning https://github.com/zack-schrag/snap-secret...


Cloning https://github.com/Vake93/cca.server...


Cloning https://github.com/gobengo/activitypub.com...


Cloning https://github.com/LeonidChetverikov/pulumi...


Cloning https://github.com/mikhailshilkov/fsharp-advent-pulumi...


Cloning https://github.com/cheab-org/infra...


Cloning https://github.com/defn/cdktf-provider-cloudflare...


Failed https://github.com/defn/cdktf-provider-cloudflare: Command '['git', 'clone', '--depth', '1', 'https://github.com/defn/cdktf-provider-cloudflare', 'iac_repos/defn_cdktf-provider-cloudflare']' returned non-zero exit status 128.
Cloning https://github.com/ameiji/pulumi-demo...


Failed https://github.com/ameiji/pulumi-demo: Command '['git', 'clone', '--depth', '1', 'https://github.com/ameiji/pulumi-demo', 'iac_repos/ameiji_pulumi-demo']' returned non-zero exit status 128.
Cloning https://github.com/elbetasal/app-playground...


Cloning https://github.com/shraddhamkandpal/healthrecord-verifier...


Cloning https://github.com/cloudvietnam18/git-pulumi...


Cloning https://github.com/phillipedwards/azure-dashboard-error...


Cloning https://github.com/dailydotdev/daily-monetization...


Cloning https://github.com/clstokes/example-multicloud-webserver...


Cloning https://github.com/lakindu2002/pulumi-micro-stacks...


Cloning https://github.com/ederst/pulumi-in-a-pickle...


Cloning https://github.com/Pocket/curated-corpus-api...


Cloning https://github.com/gabrielstellini/pulumi-playground...


Cloning https://github.com/Pjatac/KubTest...


Cloning https://github.com/s1ntaxe770r/do-pl-k8s...


Cloning https://github.com/tonskey/my-instagram-web...


Failed https://github.com/tonskey/my-instagram-web: Command '['git', 'clone', '--depth', '1', 'https://github.com/tonskey/my-instagram-web', 'iac_repos/tonskey_my-instagram-web']' returned non-zero exit status 128.
Cloning https://github.com/jeanpaulsmit/blog-posts...


Cloning https://github.com/foundry-infra/foundry-infra-cs...


Cloning https://github.com/dixler/jdconf-2022-platform-stack...


Cloning https://github.com/RichardWLaub/pulumi-repro...


Cloning https://github.com/sarwe100/dotnetprod...


Cloning https://github.com/unmango/safir-infra...


Cloning https://github.com/swgillespie/dockercon18...


Cloning https://github.com/macneib/cicd...


Cloning https://github.com/canveshh1/eks-01...


Cloning https://github.com/nszaaa3/shididev.com...


Cloning https://github.com/aaronaddleman/cdktf-typescript-docker...


Cloning https://github.com/rumsrami/drone-ecs...


Cloning https://github.com/jaxxstorm/containers-from-the-couch...


Cloning https://github.com/iiglesiasg/elastic-pulumi...


Cloning https://github.com/byhbt/cdktf-python...


Cloning https://github.com/natmarek/cdktf-learning...


Cloning https://github.com/pedramha/cdktf-azureappservice...


Cloning https://github.com/cpaton/aws-configuration...


Cloning https://github.com/creatorsgarten/configuration...


Cloning https://github.com/exobase-inc/exobase-stack-builders...


Cloning https://github.com/DanManson/gcp.iamCustomRole...


Cloning https://github.com/codegram/pulumi-dnsimple...


Cloning https://github.com/eminetto/post-pulumi...


Cloning https://github.com/philjhale/pulumi-gcp...


Cloning https://github.com/JulesP96/Sample-Pulumi...


Cloning https://github.com/camilovarela/jenkins-test...


Cloning https://github.com/MitchellGerdisch/pulumi-codefresh...


Cloning https://github.com/unmango/pulumi-homelab...


Cloning https://github.com/martinjt/pulumi-dotnet5...


Cloning https://github.com/codegram/regalocal...


Cloning https://github.com/DanielMSchmidt/cdktf-github-actions-local-demo...


Cloning https://github.com/David-VTUK/pulumi-rancher-demos...


Cloning https://github.com/gs-gs/pulumi-library...


Cloning https://github.com/dvargas92495/terraform-aws-clerk...


Cloning https://github.com/verygoodsoftwareorg/PulumiReactSSRExample...


Failed https://github.com/verygoodsoftwareorg/PulumiReactSSRExample: Command '['git', 'clone', '--depth', '1', 'https://github.com/verygoodsoftwareorg/PulumiReactSSRExample', 'iac_repos/verygoodsoftwareorg_PulumiReactSSRExample']' returned non-zero exit status 128.
Cloning https://github.com/mikhailshilkov/fosdem2019...


Cloning https://github.com/mattstratton/awesome-azure...


Cloning https://github.com/CliffJumper/pulumi-py-tf2-server...


Cloning https://github.com/ryuheechul/tf-cloud...


Cloning https://github.com/kenshoo/devops-bot...


Failed https://github.com/kenshoo/devops-bot: Command '['git', 'clone', '--depth', '1', 'https://github.com/kenshoo/devops-bot', 'iac_repos/kenshoo_devops-bot']' returned non-zero exit status 128.
Cloning https://github.com/ikedaosushi/python-sandbox...


Failed https://github.com/ikedaosushi/python-sandbox: Command '['git', 'clone', '--depth', '1', 'https://github.com/ikedaosushi/python-sandbox', 'iac_repos/ikedaosushi_python-sandbox']' returned non-zero exit status 128.
Cloning https://github.com/kmkatsma/pulumi-az-xwalk...


Cloning https://github.com/markeytos/PulumiAzureSample...


Cloning https://github.com/Aletrevi/Postchain...


Cloning https://github.com/Bharathkumarraju/gcp_for_fun...


Cloning https://github.com/swapnilmmane/cicd...


Cloning https://github.com/roylee0704/hello-webiny...


Cloning https://github.com/codegram/dashy...


Cloning https://github.com/ringge/DevSecOps...


Cloning https://github.com/ever-co/dev-pulumi...


Cloning https://github.com/nmora53/PowerDialer_quero_educ...


Cloning https://github.com/yoshikyoto/cdktf-sample-app...


Cloning https://github.com/davetrainer/api-container-app...


Cloning https://github.com/blampe/rocketpool-pulumi...


Cloning https://github.com/anandvimal/pulumi-examples...


Cloning https://github.com/epomatti/az-static-webapps-pulumi...


Cloning https://github.com/adekoyadapo/managed-k8s...


Cloning https://github.com/OctopusSamples/pulumi-python-azure...


Cloning https://github.com/pulumi/pulumi-rke...


Cloning https://github.com/RainRush/generator-react-ci-iac...


Cloning https://github.com/dgg32/ncbi-taxonomy-pulumi...


Cloning https://github.com/soulasvalentin/pulumi-poc...


Cloning https://github.com/kksudo/devoops-cdktf...


Cloning https://github.com/JakeGinnivan/pulumi-ecs-debugging...


Cloning https://github.com/jdwelch/examples...


Cloning https://github.com/leapingbytes/kenesis-or-bust...


Cloning https://github.com/ShengzhenFu/azure-pulumi-demo...


Cloning https://github.com/chrsmith/pulumi-aws-travis-cicd-demo...


Cloning https://github.com/acald-creator/deploy-sigstore-gcp...


Failed https://github.com/acald-creator/deploy-sigstore-gcp: Command '['git', 'clone', '--depth', '1', 'https://github.com/acald-creator/deploy-sigstore-gcp', 'iac_repos/acald-creator_deploy-sigstore-gcp']' returned non-zero exit status 128.
Cloning https://github.com/krsche/scratchpad...


Cloning https://github.com/AaronFriel/awsx-example...


Cloning https://github.com/janegilring/pspulumidemo...


Cloning https://github.com/clstokes/pulumi-policy-aws-iam-accessanalyzer...


Cloning https://github.com/pavan8855/enthire...


Cloning https://github.com/etherdata-blockchain/config...


Cloning https://github.com/chrisguest75/terraform_examples...


Cloning https://github.com/aochagavia/buy-me-a-beer...


Failed https://github.com/aochagavia/buy-me-a-beer: Command '['git', 'clone', '--depth', '1', 'https://github.com/aochagavia/buy-me-a-beer', 'iac_repos/aochagavia_buy-me-a-beer']' returned non-zero exit status 128.
Cloning https://github.com/wwt/pulumi-templates...


Cloning https://github.com/lordabhi1/cdk-for-terraform...


Cloning https://github.com/hpcjmart/ejemplo_1...


Cloning https://github.com/Yarel/IaC-with-Pulumi...


Cloning https://github.com/marcindulak/azure-ml-aks-intro...


Cloning https://github.com/decentraland/documentation...


Cloning https://github.com/Kurabu-chan/Kurabu...


Cloning https://github.com/Eric-Swiftly/PulumiCDNEndpointIssue...


Cloning https://github.com/muellermatthias/blog-pulumi-modules...


Cloning https://github.com/RahulMR42/oci-pulumi-java-objectstore...


Cloning https://github.com/stack72/aws-stackreference-architecture...


Cloning https://github.com/justinvp/templates...


Cloning https://github.com/pg94au/TerraformGoal...


Cloning https://github.com/ihoro-epam/cloudx-aws...


Cloning https://github.com/mpnsk/xingedin...


Cloning https://github.com/jstans/kubernetes-dashboard...


Cloning https://github.com/abvijaykumar/contactlist-blog-infra...


Cloning https://github.com/AlfredoPardo-zz/python-for-devsecops...


Cloning https://github.com/mastoj/sweetspot...


Cloning https://github.com/pierskarsenbarg/static-website-component...


Cloning https://github.com/peytoncasper/tf-cdk-examples...


Cloning https://github.com/DanielMSchmidt/cdktf-aws-networking-demo...


Cloning https://github.com/FriendlyUser/stock-pulumi...


Cloning https://github.com/discovery-chander/pulumi...


Cloning https://github.com/imod/azure-nextgen-migration...


Cloning https://github.com/lolverae/azure-infra-with-pulumi...


Failed https://github.com/lolverae/azure-infra-with-pulumi: Command '['git', 'clone', '--depth', '1', 'https://github.com/lolverae/azure-infra-with-pulumi', 'iac_repos/lolverae_azure-infra-with-pulumi']' returned non-zero exit status 128.
Cloning https://github.com/rv-terraform/cdktf-test...


Cloning https://github.com/MarcinGadz/iaac-comparision...


Cloning https://github.com/stack72/pulumi-gitlab-runner...


Cloning https://github.com/sammck/cloud-dev-box...


Cloning https://github.com/twilio-infra-as-code/taskrouter-contact-center...


Cloning https://github.com/hashicorp/cdktf-provider-random...


Cloning https://github.com/DevOpsRandD/iac_cdk_pulumi_terraform...


Pausing to avoid GitHub block...


Cloning https://github.com/vumdao/aws-guardduty-sechub...


Cloning https://github.com/shabados/infrastructure...


Cloning https://github.com/nidhiben/terraform-typescript...


Cloning https://github.com/s4nchez/storage-example...


Cloning https://github.com/ludesdeveloper/pulumi-managed-nodes-autoscale-eks...


Cloning https://github.com/fsainovich/tcb-azure-m6...


Cloning https://github.com/jguadagno/jjgnet-broadcast...


Cloning https://github.com/khaiyik2612/PulumiCICS...


Failed https://github.com/khaiyik2612/PulumiCICS: Command '['git', 'clone', '--depth', '1', 'https://github.com/khaiyik2612/PulumiCICS', 'iac_repos/khaiyik2612_PulumiCICS']' returned non-zero exit status 128.
Cloning https://github.com/milanpollock/dark-rush-photography...


Cloning https://github.com/Kapernikov/skaffold-helm-tutorial...


Cloning https://github.com/Pocket/recommendation-api...


KeyboardInterrupt: 

In [26]:
import shutil, os

base = "iac_repos"
remove_dirs = ["node_modules", ".git", "vendor", "__pycache__"]

for repo in os.listdir(base):
    for root, dirs, files in os.walk(os.path.join(base, repo)):
        for d in dirs:
            if d in remove_dirs:
                path = os.path.join(root, d)
                try:
                    shutil.rmtree(path)
                    print("Removed", path)
                except Exception as e:
                    pass


Removed iac_repos/gobengo_activitypub.com/.git
Removed iac_repos/rgl_dotnet-pulumi-libvirt-rke-example/.git
Removed iac_repos/sinanbekar_turborepo-terraform-cdk-aws-vercel/.git
Removed iac_repos/paiml_python_devops_book/.git
Removed iac_repos/paiml_python_devops_book/src/chap13-Serverless/google-python-simple-http-endpoint/node_modules
Removed iac_repos/phillipedwards_pulumi-ts-import-error/.git
Removed iac_repos/cjh-cloud_esp32_pwa_starter/.git
Removed iac_repos/Addono_guide-to-pulumi-demo--common-infra/.git
Removed iac_repos/JessicaRudd_mlplatform-infra/.git
Removed iac_repos/pierskarsenbarg_aws-iam-demo/.git
Removed iac_repos/Ishanamgai_NFT-DApp/.git
Removed iac_repos/Ishanamgai_NFT-DApp/webapp/src/modules/vendor
Removed iac_repos/MarcinGadz_iaac-comparision/.git
Removed iac_repos/w9n_pulumi-kind/.git
Removed iac_repos/sthuck_duckduckstay/.git
Removed iac_repos/hashicorp_cdktf-provider-random/.git
Removed iac_repos/DanielMSchmidt_cdktf-multi-stack-tfe/.git
Removed iac_repos/anonymou

In [27]:
import os, shutil, random

repos = os.listdir("iac_repos")
print("Total repos available:", len(repos))

# Pick all available repos (or up to 1000 if you have more)
sample_size = min(1000, len(repos))
sample = random.sample(repos, sample_size)

subset_dir = "iac_subset"
os.makedirs(subset_dir, exist_ok=True)

# Directories we want to ignore (junk)
ignore_dirs = ["node_modules", ".git", "venv", "__pycache__", "dist", "build"]

def ignore_patterns(_, names):
    return [n for n in names if n in ignore_dirs]

for r in sample:
    src = os.path.join("iac_repos", r)
    dst = os.path.join(subset_dir, r)
    if not os.path.exists(dst):
        try:
            shutil.copytree(src, dst, ignore=ignore_patterns)
            print("Copied:", r)
        except Exception as e:
            print("Failed:", r, e)



Total repos available: 936
Copied: sgrade_pulumi
Copied: Jmen_hello-pulumi
Copied: elbetasal_app-playground
Copied: Ishanamgai_NFT-DApp
Copied: justinvp_secret-outputs-repro
Copied: pulumi_pulumi
Copied: vavsab_pulumi-aiven-import-issue
Copied: pulumi_pulumi-confluentcloud
Copied: rohankumardubey_kubernetes-operator
Copied: lindydonna_velocity-examples
Copied: donbing_Pairy
Copied: syahiaoui_pulumi-deploy-clourun
Copied: bloominlabs_baseplate-go
Copied: martinothamar_EasyAzureWebApp
Copied: kubarozycki_infra-as-a-code-exercise
Copied: squaremo_pulumi-ociregistry-provider
Copied: pulumi_pulumi-auth0
Copied: kingster307_localstack_ws_bugs
Copied: andrekiba_CrazyBike
Copied: xiehan_cdktf-integration-alexa-example
Copied: JulesP96_Sample-Pulumi
Copied: iac-factory_gitlab-runner-cdktf
Copied: uma-arai_iac-story-code
Copied: RahulMR42_pulumi-python-oci-instances
Copied: RafaelFigueiredo_pulumi-digitalocean-k8s-linkerd
Copied: marcaurele_pulumi-cloud-demo
Copied: lakindu2002_pulumi-micro-stac

In [28]:
import os, pandas as pd

features = []

for root, _, files in os.walk("iac_subset"):
    for f in files:
        if f.endswith((".tf", ".json", ".yml", ".yaml", ".bicep")):
            path = os.path.join(root, f)

            try:
                size = os.path.getsize(path)
                if size > 1_000_000:  # Skip >1MB files
                    continue

                # quick check if file is readable
                valid = False
                try:
                    with open(path, "r", errors="ignore") as raw:
                        snippet = raw.read(300).lower()
                        valid = True if snippet else False
                except:
                    snippet = ""

                features.append({
                    "repo": os.path.basename(root),
                    "file": f,
                    "type": f.split(".")[-1],
                    "size_bytes": size,
                    "lines": sum(1 for _ in open(path, "r", errors="ignore")),
                    "parsed": valid,
                    "raw_text_snippet": snippet
                })
            except Exception:
                pass

df = pd.DataFrame(features)
df.to_csv("iac_features_raw.csv", index=False)
print("✅ Saved parsed base dataset to iac_features_raw.csv")
print("Total IaC files parsed:", len(df))
print(df.head())


✅ Saved parsed base dataset to iac_features_raw.csv
Total IaC files parsed: 35816
                      repo               file  type  size_bytes  lines  \
0  gobengo_activitypub.com   Pulumi.prod.yaml  yaml         281      7   
1  gobengo_activitypub.com  package-lock.json  json      118347   2557   
2  gobengo_activitypub.com        Pulumi.yaml  yaml          97      3   
3  gobengo_activitypub.com       package.json  json         343     14   
4  gobengo_activitypub.com    Pulumi.dev.yaml  yaml         321      7   

   parsed                                   raw_text_snippet  
0    True  config:\n  activitypub.com-cloud:apex-dns-stac...  
1    True  {\n    "name": "activitypub.com-cloud",\n    "...  
2    True  name: activitypub.com-cloud\nruntime: nodejs\n...  
3    True  {\n    "name": "activitypub.com-cloud",\n    "...  
4    True  config:\n  activitypub.com-cloud:apex-dns-stac...  


In [29]:
import os
import pandas as pd
from tqdm import tqdm

# Define IaC file extensions
iac_exts = (".tf", ".json", ".yaml", ".yml", ".bicep")
base_dir = "iac_repos"

records = []

# Walk all repos
for repo in tqdm(os.listdir(base_dir), desc="Repos"):
    repo_path = os.path.join(base_dir, repo)
    for root, dirs, files in os.walk(repo_path):
        for f in files:
            if f.endswith(iac_exts) or f.lower() in ["cdktf.json", "pulumi.yaml"]:
                file_path = os.path.join(root, f)
                try:
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                        content = fh.read()
                    records.append({
                        "repo": repo,
                        "file": os.path.relpath(file_path, repo_path),
                        "type": f.split(".")[-1].lower(),
                        "size_bytes": os.path.getsize(file_path),
                        "lines": content.count("\n") + 1,
                        "full_content": content
                    })
                except Exception as e:
                    print(f"⚠️ Error reading {file_path}: {e}")

# Convert to DataFrame
df = pd.DataFrame(records)
print("Total IaC files parsed:", df.shape[0])

# Save as Parquet (compressed, efficient)
df.to_parquet("iac_files_full.parquet", engine="pyarrow", compression="snappy")

print("✅ Saved full IaC dataset to iac_files_full.parquet")


Repos:  60%|█████▉    | 561/936 [00:13<00:08, 42.91it/s]

⚠️ Error reading iac_repos/nullify005_HomeAssistant/app/config/service_account.json: [Errno 2] No such file or directory: 'iac_repos/nullify005_HomeAssistant/app/config/service_account.json'
⚠️ Error reading iac_repos/nullify005_HomeAssistant/app/config/secrets.yaml: [Errno 2] No such file or directory: 'iac_repos/nullify005_HomeAssistant/app/config/secrets.yaml'


Repos: 100%|██████████| 936/936 [00:27<00:00, 34.03it/s]


Total IaC files parsed: 35278
✅ Saved full IaC dataset to iac_files_full.parquet


In [None]:
# Colab-specific drive mount removed for local execution


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Skipping Google Drive backup when running locally.
print('Skipping Drive backup; running locally.')


✅ Backup saved to Google Drive at /content/drive/MyDrive/iac_files_full.parquet


In [None]:
# If checkov is not installed, run: pip install checkov


Collecting checkov
  Downloading checkov-3.2.473-py3-none-any.whl.metadata (26 kB)
Collecting bc-python-hcl2==0.4.3 (from checkov)
  Downloading bc_python_hcl2-0.4.3-py3-none-any.whl.metadata (4.2 kB)
Collecting bc-detect-secrets==1.5.45 (from checkov)
  Downloading bc_detect_secrets-1.5.45-py3-none-any.whl.metadata (23 kB)
Collecting bc-jsonpath-ng==1.6.1 (from checkov)
  Downloading bc_jsonpath_ng-1.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pycep-parser==0.5.1 (from checkov)
  Downloading pycep_parser-0.5.1-py3-none-any.whl.metadata (2.6 kB)
Collecting colorama<0.5.0,>=0.4.3 (from checkov)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting termcolor<2.4.0,>=1.1.0 (from checkov)
  Downloading termcolor-2.3.0-py3-none-any.whl.metadata (5.3 kB)
Collecting junit-xml<2.0,>=1.9 (from checkov)
  Downloading junit_xml-1.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting dpath==2.1.3 (from checkov)
  Downloading dpath-2.1.3-py3-none-any.whl.metadata (15 kB)


In [1]:
import pandas as pd
df = pd.read_parquet("iac_files_full.parquet")
print(df.shape)


(35278, 6)


In [2]:
import os

subset_dir = "iac_subset"
iac_files = []

for root, _, files in os.walk(subset_dir):
    for f in files:
        if f.endswith((".tf", ".yaml", ".yml", ".json", ".bicep")):
            path = os.path.join(root, f)
            with open(path, "r", errors="ignore") as fh:
                head = fh.read(200).lower()
                # Look for IaC markers
                if any(keyword in head for keyword in ["resource", "provider", "apiVersion", "kind", "Resources:"]):
                    iac_files.append(path)

print("Found IaC-like files:", len(iac_files))
print("Examples:")
for f in iac_files[:5]:
    print(" -", f)


Found IaC-like files: 11895
Examples:
 - iac_subset/paiml_python_devops_book/src/chap10-InfraAsCode/terraform/main.tf
 - iac_subset/paiml_python_devops_book/src/chap10-InfraAsCode/terraform/modules/route53/main.tf
 - iac_subset/paiml_python_devops_book/src/chap10-InfraAsCode/terraform/modules/s3/main.tf
 - iac_subset/paiml_python_devops_book/src/chap10-InfraAsCode/terraform/modules/cloudfront/main.tf
 - iac_subset/paiml_python_devops_book/src/chap10-InfraAsCode/terraform/modules/acm/main.tf


In [6]:
import subprocess, json

# Use explicit Checkov command path found on this Windows system
CHECKOV_CMD = r"C:\Users\chava\AppData\Roaming\Python\Python311\Scripts\checkov.cmd"

def run_checkov(file_path, timeout=30):
    """Run Checkov on one file and return parsed JSON result."""
    try:
        cmd = [CHECKOV_CMD, "-f", file_path, "-o", "json"]
        out = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        if out.returncode == 0 and out.stdout.strip():
            return json.loads(out.stdout)
        else:
            # Print stderr for clearer diagnostics
            stderr = out.stderr.strip() if out and out.stderr else ''
            print(f"⚠️ Checkov returned no results or error on {file_path}: rc={out.returncode}, stderr={stderr}")
    except subprocess.TimeoutExpired:
        print(f"⏳ Timeout on {file_path}")
    except Exception as e:
        print(f"⚠️ Error running Checkov on {file_path}: {e}")
    return None


In [None]:
test_file = "iac_subset/paiml_python_devops_book/src/chap10-InfraAsCode/terraform/main.tf"
result = run_checkov(test_file, timeout=30)

if result and "results" in result:
    failed = result["results"].get("failed_checks", [])
    print("🔍 File:", test_file)
    print("   Failed checks:", len(failed))
    if failed:
        print("   Example issue:", failed[0].get("check_id"), "-", failed[0].get("check_name"))
else:
    print("❌ No result")


🔍 File: iac_subset/paiml_python_devops_book/src/chap10-InfraAsCode/terraform/main.tf
   Failed checks: 0


: 

In [None]:
import pandas as pd
df = pd.DataFrame(results)

# Save initial Checkov results locally
save_path = "checkov_results_local.csv"
df.to_csv(save_path, index=False)

print(f"✅ Initial Checkov results saved to {save_path}")
print("Shape:", df.shape)
print(df.head())


✅ Initial Checkov results saved to /content/drive/MyDrive/checkov_results_partial.csv
Shape: (2, 6)
                                                file     check_id  \
0  iac_subset/paiml_python_devops_book/src/chap10...   CKV_AWS_70   
1  iac_subset/paiml_python_devops_book/src/chap10...  CKV_AWS_310   

                                          check_name severity  \
0  Ensure S3 bucket does not allow an action with...     HIGH   
1  Ensure CloudFront distributions should have or...   MEDIUM   

                              resource  \
0                aws_s3_bucket.example   
1  aws_cloudfront_distribution.example   

                                           guideline  
0  https://docs.bridgecrew.io/docs/s3_1-acl-read-...  
1  https://docs.bridgecrew.io/docs/cloudfront_1-e...  


In [None]:
# Running locally: Google Colab drive mount is not available in this environment.
# If you run on Colab and need to remount, call: drive.mount('/content/drive', force_remount=True)
print('Running locally; not attempting to mount /content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# Load local parquet and checkov CSV if available
try:
    df = pd.read_parquet("iac_files_full.parquet")
except Exception as e:
    print('Failed to load iac_files_full.parquet:', e)
    df = pd.DataFrame()
try:
    scanned = pd.read_csv("checkov_results_local.csv")
except Exception:
    scanned = pd.DataFrame()
print('IaC rows:', getattr(df, 'shape', None), 'Checkov rows:', getattr(scanned, 'shape', None))


(35278, 6) (2, 6)


In [None]:
# Colab-specific drive mount removed for local execution

# Load parsed IaC dataset from local path if available
import pandas as pd
try:
    iac_files = pd.read_parquet("iac_files_full.parquet")
    print("IaC dataset loaded:", iac_files.shape)
except FileNotFoundError:
    print("Local iac_files_full.parquet not found. Skipping this step.")

# Load Checkov results (if exists)
try:
    checkov_results = pd.read_csv("checkov_results_partial.csv")
    print("Checkov results loaded:", checkov_results.shape)
except FileNotFoundError:
    checkov_results = pd.DataFrame(columns=["file","check_id","check_name","severity","resource","guideline"])
    print("No existing Checkov results, starting fresh")

# Show samples (if available)
try:
    display(iac_files.head())
except Exception:
    pass
try:
    display(checkov_results.head())
except Exception:
    pass


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
IaC dataset loaded: (35278, 6)
Checkov results loaded: (2, 6)


Unnamed: 0,repo,file,type,size_bytes,lines,full_content
0,gobengo_activitypub.com,Pulumi.prod.yaml,yaml,281,8,config:\n activitypub.com-cloud:apex-dns-stac...
1,gobengo_activitypub.com,package-lock.json,json,118347,2558,"{\n ""name"": ""activitypub.com-cloud"",\n ""..."
2,gobengo_activitypub.com,Pulumi.yaml,yaml,97,4,name: activitypub.com-cloud\nruntime: nodejs\n...
3,gobengo_activitypub.com,package.json,json,343,15,"{\n ""name"": ""activitypub.com-cloud"",\n ""..."
4,gobengo_activitypub.com,Pulumi.dev.yaml,yaml,321,8,config:\n activitypub.com-cloud:apex-dns-stac...


Unnamed: 0,file,check_id,check_name,severity,resource,guideline
0,iac_subset/paiml_python_devops_book/src/chap10...,CKV_AWS_70,Ensure S3 bucket does not allow an action with...,HIGH,aws_s3_bucket.example,https://docs.bridgecrew.io/docs/s3_1-acl-read-...
1,iac_subset/paiml_python_devops_book/src/chap10...,CKV_AWS_310,Ensure CloudFront distributions should have or...,MEDIUM,aws_cloudfront_distribution.example,https://docs.bridgecrew.io/docs/cloudfront_1-e...


In [3]:
# Aggregate Checkov findings per file and merge with IaC dataset
import pandas as pd

# Ensure datasets exist
try:
    iac_files = pd.read_parquet("iac_files_full.parquet")
    print('Loaded iac_files_full.parquet:', iac_files.shape)
except Exception as e:
    print('iac_files_full.parquet not available locally:', e)
    iac_files = pd.DataFrame()

try:
    checkov_results = pd.read_csv("checkov_results_local.csv")
    print('Loaded checkov_results_local.csv:', checkov_results.shape)
except Exception as e:
    print('checkov_results_local.csv not available:', e)
    checkov_results = pd.DataFrame(columns=["file","check_id","check_name","severity","resource","guideline"]) 

# If Checkov results are present, aggregate per file
if not checkov_results.empty:
    # Group check_ids and severities per file
    agg = checkov_results.groupby('file').agg({
        'check_id': lambda ids: list(pd.unique([i for i in ids if pd.notna(i)])),
        'severity': lambda s: list(pd.unique([i for i in s if pd.notna(i)])),
        'check_name': lambda names: list(pd.unique([n for n in names if pd.notna(n)])),
        'resource': lambda r: list(pd.unique([x for x in r if pd.notna(x)])),
    }).reset_index()
    agg = agg.rename(columns={'check_id': 'check_ids', 'severity': 'severities', 'check_name': 'check_names', 'resource': 'resources'})
else:
    agg = pd.DataFrame(columns=['file','check_ids','severities','check_names','resources'])

# Merge with IaC files (if available). The 'file' column in checkov_results likely contains full paths; attempt to match on suffix.
if not iac_files.empty and not agg.empty:
    # Normalize file paths: make both use forward slashes and compare suffixes
    iac_files['normalized_path'] = iac_files['file'].str.replace('\\\\', '/', regex=True)
    agg['normalized_file'] = agg['file'].str.replace('\\\\', '/', regex=True)

    # Attempt direct join first
    merged = iac_files.merge(agg, left_on='file', right_on='file', how='left')

    # If no matches, try matching by suffix (endswith)
    if merged['check_ids'].isna().all():
        print('Direct file join yielded no matches; attempting suffix match')
        # Build a mapping from basename or suffix to agg row
        agg_indexed = agg.set_index('file')
        def find_checks_for_path(path):
            # exact
            if path in agg_indexed.index:
                return agg_indexed.loc[path].to_dict()
            # suffix match
            for k in agg_indexed.index:
                if path.endswith(k):
                    return agg_indexed.loc[k].to_dict()
            return {'check_ids': [], 'severities': [], 'check_names': [], 'resources': []}

        merged_rows = []
        for _, row in iac_files.iterrows():
            path = row['file']
            checks = find_checks_for_path(path)
            new = row.to_dict()
            new.update(checks)
            merged_rows.append(new)
        merged = pd.DataFrame(merged_rows)
else:
    # If IaC files not available, just save aggregated Checkov results
    if not agg.empty:
        merged = agg.copy()
    else:
        merged = pd.DataFrame()

# Save merged output
out_path = 'iac_with_checkov.csv'
try:
    merged.to_csv(out_path, index=False)
    print('Saved merged IaC + Checkov to', out_path)
    print('Merged shape:', getattr(merged, 'shape', None))
except Exception as e:
    print('Failed to save merged CSV:', e)

# Show sample rows
try:
    display(merged.head())
except Exception:
    print('Merged preview not available')


iac_files_full.parquet not available locally: [Errno 2] No such file or directory: 'iac_files_full.parquet'
Loaded checkov_results_local.csv: (206, 2)


KeyError: 'file'

In [2]:
# Diagnostic: inspect checkov results CSV structure
import pandas as pd
import os

csv_path = 'checkov_results_local.csv'
if os.path.exists(csv_path):
    df_check = pd.read_csv(csv_path)
    print('Shape:', df_check.shape)
    print('\nColumns:')
    print(df_check.columns.tolist())
    print('\nDtypes:')
    print(df_check.dtypes)
    print('\nSample rows:')
    display(df_check.head(10))
else:
    print('No checkov_results_local.csv found')


Shape: (103, 2)

Columns:
['target', 'error']

Dtypes:
target    object
error     object
dtype: object

Sample rows:


Unnamed: 0,target,error
0,d:\CloudGuardAI\iac_subset\DanManson_gcp.iamCu...,[WinError 193] %1 is not a valid Win32 applica...
1,d:\CloudGuardAI\iac_subset\ChanghoonHyun_cdktf...,[WinError 193] %1 is not a valid Win32 applica...
2,d:\CloudGuardAI\iac_subset\Bharathkumarraju_gc...,[WinError 193] %1 is not a valid Win32 applica...
3,d:\CloudGuardAI\iac_subset\AlfredoPardo-zz_pyt...,[WinError 193] %1 is not a valid Win32 applica...
4,d:\CloudGuardAI\iac_subset\DanielMSchmidt_cdkt...,[WinError 193] %1 is not a valid Win32 applica...
5,d:\CloudGuardAI\iac_subset\DanielMSchmidt_cdkt...,[WinError 193] %1 is not a valid Win32 applica...
6,d:\CloudGuardAI\iac_subset\DanielMSchmidt_cdkt...,[WinError 193] %1 is not a valid Win32 applica...
7,d:\CloudGuardAI\iac_subset\DanielMSchmidt_terr...,[WinError 193] %1 is not a valid Win32 applica...
8,d:\CloudGuardAI\iac_subset\JakeGinnivan_pulumi...,[WinError 193] %1 is not a valid Win32 applica...
9,d:\CloudGuardAI\iac_subset\DevOpsRandD_iac_cdk...,[WinError 193] %1 is not a valid Win32 applica...


In [10]:
import os, subprocess, json, pandas as pd
from tqdm import tqdm

# Paths
subset_dir = "iac_subset"
results_path = "/content/drive/MyDrive/checkov_results_partial.csv"

# Load existing results
try:
    checkov_results = pd.read_csv(results_path)
    print("Loaded existing results:", checkov_results.shape)
except FileNotFoundError:
    checkov_results = pd.DataFrame(columns=["file","check_id","check_name","severity","resource","guideline"])
    print("No existing results, starting fresh")

# Collect IaC files
iac_files = []
for root, _, files in os.walk(subset_dir):
    for f in files:
        if f.endswith((".tf", ".json", ".yml", ".yaml", ".bicep")):
            iac_files.append(os.path.join(root, f))

print("Total IaC files:", len(iac_files))

# Remove files already scanned
done_files = set(checkov_results["file"].tolist())
pending_files = [f for f in iac_files if f not in done_files]
print("Already scanned:", len(done_files))
print("Pending:", len(pending_files))

def run_checkov(file_path, timeout=30):
    """Run checkov on one file with a timeout."""
    try:
        cmd = ["checkov", "-f", file_path, "-o", "json"]
        out = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        if out.returncode == 0 and out.stdout.strip():
            return json.loads(out.stdout)
    except subprocess.TimeoutExpired:
        print(f"⏳ Timeout on {file_path}")
    except Exception as e:
        print(f"⚠️ Error on {file_path}: {e}")
    return None

# Batch scanning (small chunks to be safe)
batch_size = 20
for i in tqdm(range(0, len(pending_files), batch_size), desc="Batches"):
    batch = pending_files[i:i+batch_size]
    new_results = []

    for file_path in batch:
        result = run_checkov(file_path, timeout=30)
        if result and "results" in result:
            for failed_check in result["results"].get("failed_checks", []):
                new_results.append({
                    "file": file_path,
                    "check_id": failed_check.get("check_id"),
                    "check_name": failed_check.get("check_name"),
                    "severity": failed_check.get("severity"),
                    "resource": failed_check.get("resource"),
                    "guideline": failed_check.get("guideline")
                })

    # Append new batch results
    if new_results:
        checkov_results = pd.concat([checkov_results, pd.DataFrame(new_results)], ignore_index=True)
        checkov_results.to_csv(results_path, index=False)
        print(f"💾 Saved progress: {checkov_results.shape}")

print("✅ Scanning finished")


Loaded existing results: (2, 6)
Total IaC files: 35966
Already scanned: 2
Pending: 35964


Batches:   0%|          | 0/1799 [00:33<?, ?it/s]


KeyboardInterrupt: 

In [11]:
import os
for root, _, files in os.walk("iac_subset"):
    for f in files[:10]:  # show 10 examples
        print(os.path.join(root, f))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
iac_subset/ServiTrace_ReplicationPackage/applications/pulumi-examples/aws-go-console-slack-notification/main.go
iac_subset/ServiTrace_ReplicationPackage/applications/pulumi-examples/aws-go-console-slack-notification/go.sum
iac_subset/ServiTrace_ReplicationPackage/applications/pulumi-examples/aws-go-console-slack-notification/README.md
iac_subset/ServiTrace_ReplicationPackage/applications/pulumi-examples/aws-go-console-slack-notification/handler/handler.go
iac_subset/ServiTrace_ReplicationPackage/applications/pulumi-examples/aws-ts-eks-migrate-nodegroups/echoserver.ts
iac_subset/ServiTrace_ReplicationPackage/applications/pulumi-examples/aws-ts-eks-migrate-nodegroups/utils.ts
iac_subset/ServiTrace_ReplicationPackage/applications/pulumi-examples/aws-ts-eks-migrate-nodegroups/iam.ts
iac_subset/ServiTrace_ReplicationPackage/applications/pulumi-examples/aws-ts-eks-migrate-nodegroups/Pulumi.yaml
iac_subset/ServiTrace_Replication

In [12]:
import pandas as pd

df = pd.read_parquet("/content/drive/MyDrive/iac_files_full.parquet")
print("IaC dataset restored:", df.shape)


IaC dataset restored: (35278, 6)


In [13]:
import os

subset_dir = "iac_temp"
os.makedirs(subset_dir, exist_ok=True)

# Save 100 sample files for Checkov testing
for i, row in df.head(100).iterrows():
    fpath = os.path.join(subset_dir, f"{row['repo']}_{i}.{row['type']}")
    with open(fpath, "w") as f:
        f.write(row["full_content"])


In [14]:
import subprocess, json

def run_checkov(file_path):
    try:
        cmd = ["checkov", "-f", file_path, "-o", "json"]
        out = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        if out.returncode == 0 and out.stdout.strip():
            return json.loads(out.stdout)
    except Exception as e:
        print("⚠️", e)
    return None


In [16]:
import os

subset_dir = "iac_subset"   # or "iac_repos" if you kept all repos
iac_files = []

for root, _, files in os.walk(subset_dir):
    for f in files:
        if f.endswith((".tf", ".json", ".yml", ".yaml", ".bicep")):
            iac_files.append(os.path.join(root, f))

print("Found IaC files:", len(iac_files))
print("Examples:", iac_files[:5])


Found IaC files: 35966
Examples: ['iac_subset/gobengo_activitypub.com/Pulumi.prod.yaml', 'iac_subset/gobengo_activitypub.com/package-lock.json', 'iac_subset/gobengo_activitypub.com/Pulumi.yaml', 'iac_subset/gobengo_activitypub.com/package.json', 'iac_subset/gobengo_activitypub.com/Pulumi.dev.yaml']


In [17]:
import subprocess, json

def run_checkov(file_path, timeout=60):
    """Run Checkov on a file and return JSON results if available."""
    try:
        cmd = ["checkov", "-f", file_path, "-o", "json"]
        out = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        if out.returncode == 0 and out.stdout.strip():
            return json.loads(out.stdout)
    except subprocess.TimeoutExpired:
        print(f"⏳ Timeout on {file_path}")
    except Exception as e:
        print(f"⚠️ Error on {file_path}: {e}")
    return None


In [19]:
import pandas as pd, os, subprocess, json
from tqdm import tqdm

# Load IaC files
subset_dir = "iac_subset"
iac_files = []
for root, _, files in os.walk(subset_dir):
    for f in files:
        if f.endswith((".tf", ".json", ".yml", ".yaml", ".bicep")):
            iac_files.append(os.path.join(root, f))

print("Total IaC files:", len(iac_files))

# Load already scanned
if os.path.exists("checkov_results_partial.csv"):
    scanned = pd.read_csv("checkov_results_partial.csv")["file"].unique().tolist()
else:
    scanned = []

pending_files = [f for f in iac_files if f not in scanned]
print("Pending files:", len(pending_files))


def run_checkov(file_path, timeout=60):
    try:
        cmd = ["checkov", "-f", file_path, "-o", "json"]
        out = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        if out.returncode == 0 and out.stdout.strip():
            return json.loads(out.stdout)
    except subprocess.TimeoutExpired:
        print(f"⏳ Timeout on {file_path}")
    except Exception as e:
        print(f"⚠️ Error on {file_path}: {e}")
    return None


# Batch size small for testing
batch_size = 100
results = []

for i in tqdm(range(0, len(pending_files), batch_size), desc="Batches"):
    batch = pending_files[i:i+batch_size]

    for file_path in batch:
        result = run_checkov(file_path, timeout=30)
        if result and "results" in result:
            for failed_check in result["results"].get("failed_checks", []):
                results.append({
                    "file": file_path,
                    "check_id": failed_check.get("check_id"),
                    "check_name": failed_check.get("check_name"),
                    "severity": failed_check.get("severity"),
                    "resource": failed_check.get("resource"),
                    "guideline": failed_check.get("guideline")
                })

    # Append results to file (instead of overwrite)
    df = pd.DataFrame(results)
    if os.path.exists("checkov_results_partial.csv"):
        df.to_csv("checkov_results_partial.csv", mode="a", header=False, index=False)
    else:
        df.to_csv("checkov_results_partial.csv", index=False)

    results = []  # clear memory
    break  # <-- REMOVE later; for now, process only 1 batch to test


Total IaC files: 35966
Pending files: 35966


Batches:   0%|          | 0/360 [01:44<?, ?it/s]


KeyboardInterrupt: 

In [20]:
import os, subprocess, json, pandas as pd
from tqdm import tqdm

subset_dir = "iac_subset"

# Collect IaC-like files
iac_files = []
for root, _, files in os.walk(subset_dir):
    for f in files:
        if f.endswith((".tf", ".json", ".yml", ".yaml", ".bicep")):
            iac_files.append(os.path.join(root, f))

print("Total IaC files:", len(iac_files))

# Load already scanned
if os.path.exists("checkov_results_partial.csv"):
    scanned = pd.read_csv("checkov_results_partial.csv")["file"].unique().tolist()
else:
    scanned = []

pending_files = [f for f in iac_files if f not in scanned]
print("Pending files:", len(pending_files))


def run_checkov(file_path, timeout=30):
    """Run Checkov on one file and return parsed results."""
    try:
        cmd = ["checkov", "-f", file_path, "-o", "json"]
        out = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        if out.returncode == 0 and out.stdout.strip():
            return json.loads(out.stdout)
    except subprocess.TimeoutExpired:
        print(f"⏳ Timeout on {file_path}")
    except Exception as e:
        print(f"⚠️ Error on {file_path}: {e}")
    return None


# Use batch size = 10
batch_size = 10
results = []

for i in tqdm(range(0, len(pending_files), batch_size), desc="Batches"):
    batch = pending_files[i:i+batch_size]

    for file_path in batch:
        result = run_checkov(file_path, timeout=30)
        if result and "results" in result:
            for failed_check in result["results"].get("failed_checks", []):
                results.append({
                    "file": file_path,
                    "check_id": failed_check.get("check_id"),
                    "check_name": failed_check.get("check_name"),
                    "severity": failed_check.get("severity"),
                    "resource": failed_check.get("resource"),
                    "guideline": failed_check.get("guideline")
                })

    # Save after each batch (resume safe)
    df = pd.DataFrame(results)
    if not df.empty:
        if os.path.exists("checkov_results_partial.csv"):
            df.to_csv("checkov_results_partial.csv", mode="a", header=False, index=False)
        else:
            df.to_csv("checkov_results_partial.csv", index=False)

    results = []  # free memory
    break  # <-- only run 1 batch for testing


Total IaC files: 35966
Pending files: 35966


Batches:   0%|          | 0/3597 [00:48<?, ?it/s]
