In [2]:
from dotenv import load_dotenv
from google.cloud import aiplatform
# This script initializes the Vertex AI environment and loads a model for evaluation.
from google.cloud.aiplatform import Model
import os
load_dotenv("../.env")  # Load environment variables from .env file
import pandas as pd
# Initialize the Vertex AI environment


# Load the model
model_names = [
    "gemini-2.5-pro",
    "gemini-2.0-flash",
    "gemini-2.0-flash-lite",
]


In [45]:

df = pd.read_csv("hf://datasets/autoiac-project/iac-eval/data.csv")
df.columns

Index(['Resource', 'Prompt', 'Rego intent', 'Difficulty', 'Reference output',
       'Intent'],
      dtype='object')

In [44]:

n_few_shot = 5


few_shot_examples = df.sample(n=n_few_shot, random_state=42)
few_shot_indices = few_shot_examples.index


df_remaining = df.drop(index=few_shot_indices).reset_index(drop=True)

def format_example(row):
    return f"Input: {row['Prompt']}\nOutput: \n[CODE] \n {row['Reference output']} \n[/CODE]\n"

few_shot_prompt = "\n".join(format_example(row) for _, row in few_shot_examples.iterrows())

# Print or save the prompt
print(few_shot_prompt)

# Optional: Save remaining data and few-shot prompt if needed
# df_remaining.to_csv("dataset_without_fewshot.csv", index=False)
# with open("few_shot_prompt.txt", "w") as f:
#     f.write(few_shot_prompt)


Input: An AWS service that holds a web server which allows you to upload cat pictures and provides random cat pictures on demand. Accomplish this using the following resources: AWS DynamoDB table, AWS S3 bucket, AWS Lambda function, AWS Lambda permission, AWS API Gateway rest API, AWS API Gateway resource, AWS API Gateway method. Add any necessary resources.
Output: 
[CODE] 
 terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.75"
    }
  }

  required_version = "~> 1.9.8"
}


provider "aws" {
  region  = "us-east-1"
  profile = "admin-1"

  assume_role {
    role_arn = "arn:aws:iam::590184057477:role/yicun-iac"
  }
}

resource "aws_dynamodb_table" "caas" {
  name           = "cat_names"
  hash_key       = "name"
  billing_mode   = "PAY_PER_REQUEST"

  attribute {
    name = "name"
    type = "S"
  }
}

resource "aws_s3_bucket" "caas" {
  bucket_prefix = "cat-image"
}

resource "aws_iam_role" "lambda_role" {
  name = "lambda_api_gateway_

In [None]:
from google import genai
from google.genai import types
client = genai.Client(
  vertexai=True, project=os.getenv("GOOGLE_PROJECT_ID"), location="us-central1",
)
'''
question = "Write a Terraform script to create an S3 bucket with versioning enabled and server-side encryption using AES-256."

model = "gemini-2.0-flash-lite-001"
response = client.models.generate_content(
  model=model,
  contents = few_shot_prompt + f"\n The question you have to answer: {question}",
  config=types.GenerateContentConfig(
    system_instruction="You are a helpful AWS cloud engineer, specialized in wrinting IaC (Infrastructure as Code) scripts using Terraform." \
        "You are given a question and you must answer it with a Terraform script. The answer must be sourrounded by [CODE] and [/CODE] tags. The script must be valid and executable." \
        "You will find some example to use as a reference of input and output to use as a reference and then you will be asked to answer a question.",
    )
)
print(response.text, end="")'''

[CODE]
 terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.75"
    }
  }

  required_version = "~> 1.9.8"
}

provider "aws" {
  region  = "us-east-1"
  profile = "admin-1"

  assume_role {
    role_arn = "arn:aws:iam::590184057477:role/yicun-iac"
  }
}

resource "aws_s3_bucket" "example" {
  bucket = "my-versioned-bucket-tf-example" # Replace with a globally unique bucket name
  acl    = "private"

  versioning {
    enabled = true
  }

  server_side_encryption_configuration {
    rule {
      apply_server_side_encryption_by_default {
        sse_algorithm = "AES256"
      }
    }
  }
}
[/CODE]

In [55]:
# Get random rows form the dataset
# df = pd.read_csv("hf://datasets/autoiac-project/iac-eval/data
import re
n_tests = 10

def get_last_terraform(text):
    # Use regex to find the last occurrence of [CODE]...[/CODE]
    match = re.search(r'\[CODE\](.*?)\[/CODE\]', text, re.DOTALL)
    if match:
        return match.group(1).strip()  # Return the content inside [CODE] tags
    return "No valid Terraform script found."

tests = df.sample(n=n_tests, random_state=42)
tests_index = tests.index

df_remaining = df.drop(index=tests_index).reset_index(drop=True)
results = pd.DataFrame(columns=["Prompt", "Reference output", "Gemini-2.5-pro", "Gemini-2.0-flash", "Gemini-2.0-flash-lite"])
for index, row in tests.iterrows():
    responses = []
    for model_name in model_names:
        response = client.models.generate_content(
            model=model_name,
            contents=few_shot_prompt + f"\n The question you have to answer: {row['Prompt']}",
            config=types.GenerateContentConfig(
                system_instruction="You are a helpful AWS cloud engineer, specialized in wrinting IaC (Infrastructure as Code) scripts using Terraform." \
                    "You are given a question and you must answer it with a Terraform script. The answer must be sourrounded by [CODE] and [/CODE] tags. The script must be valid and executable." \
                    "You will find some example to use as a reference of input and output to use as a reference and then you will be asked to answer a question.",
            )
        )
        print(f"Response from {model_name}: {response.text}")
        # ris = get_last_terraform(response.text)
        # print(f"Cutted Response from {model_name}: {ris}")
        responses.append(response.text)
    results.loc[len(results)] = [row['Prompt'], row['Reference output'], responses[0], responses[1], responses[2]]
# Save results to a CSV file
results.to_csv("model_evaluation_results.csv", index=False)


Response from gemini-2.5-pro: [CODE]
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.75"
    }
  }

  required_version = "~> 1.9.8"
}

provider "aws" {
  region = "us-east-1"
}

# S3 bucket to store the cat images
resource "aws_s3_bucket" "cat_pictures_bucket" {
  bucket_prefix = "cat-pictures-storage-"
}

# DynamoDB table to store metadata about the cats
resource "aws_dynamodb_table" "cats_table" {
  name         = "cats-table"
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "ImageName"

  attribute {
    name = "ImageName"
    type = "S"
  }
}

# IAM role for the Lambda function
resource "aws_iam_role" "lambda_exec_role" {
  name = "cat-app-lambda-execution-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action = "sts:AssumeRole"
      Effect = "Allow"
      Principal = {
        Service = "lambda.amazonaws.com"
      }
    }]
  })
}

# IAM policy granting the Lambda function neces

In [56]:
# results dataset cleaning
results = pd.read_csv("model_evaluation_results.csv")
results = results.dropna().reset_index(drop=True)

def extract_or_get_last_terraform(text):
    # Use regex to find the last occurrence of [CODE]...[/CODE]
    match = re.search(r'\[CODE\](.*?)\[/CODE\]', text, re.DOTALL)
    if match:
        return match.group(1).strip()  # Return the content inside [CODE] tags
    return text

results["Gemini-2.5-pro"] = results["Gemini-2.5-pro"].apply(extract_or_get_last_terraform)
results["Gemini-2.0-flash"] = results["Gemini-2.0-flash"].apply(extract_or_get_last_terraform)
results["Gemini-2.0-flash-lite"] = results["Gemini-2.0-flash-lite"].apply(extract_or_get_last_terraform)

results.to_csv("model_evaluation_results_cleaned.csv", index=False)


In [13]:
import bleu
import evaluate

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
results = pd.read_csv("model_evaluation_results_cleaned.csv")
def compute_bleu_score(references, generated_texts):
    """ Compute the BLEU score between a list of references and a list of processed texts """
    scores = []
    for reference, generated_text in zip(references, generated_texts):
        if len(generated_text) == 0:
            scores.append(0)
            continue

        results = bleu.compute(references=[[reference]], predictions=[generated_text])
        scores.append(results.get("bleu", 0.0))
    return scores

def compute_rouge_score(references, generated_texts):
    """ Compute the ROUGE score between a list of references and a list of processed texts """
    return rouge.compute(predictions=generated_texts, references=references, use_aggregator=False)

evaluation = pd.DataFrame(columns=["model", "BLEU_score", "ROUGE1_score", "ROUGE2_score", "ROUGEL_score"])
# Calculate BLEU scores for each model's responses against the reference outputs

for model_name in results.columns[2:]:  # Skip the first two columns (Prompt and Reference output)
    bleu_score = compute_bleu_score(
        references=results["Reference output"],
        generated_texts=results[model_name]
    )

    #comoute the average BLEU score
    bleu_score = sum(bleu_score) / len(bleu_score) if bleu_score else 0
    
    rouge_score = compute_rouge_score(
        references=results["Reference output"],
        generated_texts=results[model_name]
    )
    rouge1 = sum(rouge_score["rouge1"]) / len(rouge_score["rouge1"]) if rouge_score else 0
    rouge2 = sum(rouge_score["rouge2"]) / len(rouge_score["rouge2"]) if rouge_score else 0
    rougel = sum(rouge_score["rougeLsum"]) / len(rouge_score["rougeLsum"]) if rouge_score else 0
    
    evaluation.loc[len(evaluation)] = [model_name, bleu_score, rouge1,rouge2, rougel]

evaluation.to_csv("model_evaluation_scores.csv", index=False)



In [None]:
'''eval_result = EvalTask(
    dataset=eval_dataset, metrics=[text_quality], experiment=EXPERIMENT_NAME
).evaluate()

aiplatform.ExperimentRun(
    run_name=eval_result.metadata["experiment_run"],
    experiment=eval_result.metadata["experiment"],
).delete()'''