# Repo Chain
- repo를 보고 cloud provider instance 정보 추출
- Language ratio 추출

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file, then set the LangChain project name
# and tracing flag for this session (presumably consumed by LangSmith tracing
# -- confirm against the LangChain version in use).
load_dotenv()
os.environ.update(
    {
        "LANGCHAIN_PROJECT": "Git Analyzer",
        "LANGCHAIN_TRACING_V2": "true",
    }
)

In [22]:
from typing import Dict, Optional
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Structured description of one cloud VM instance. The Field descriptions are
# part of the schema handed to the output parser, so they double as
# instructions for the model and must not be edited casually.
# NOTE(review): documented with `#` comments rather than a docstring on
# purpose -- a class docstring would presumably be emitted into the generated
# schema and change the format instructions; confirm before adding one.
class Instance(BaseModel):
    cloud_provider: str = Field(description="Name of cloud provider (GCP or AWS or Azure)")
    name: str = Field(description="Name of instance type")
    cpu: int = Field(description="The number of instance cpu cores")
    ram: float = Field(description="The capacity of instance ram (GiB)")
    storage: int = Field(description="The capacity of instance storage (GiB)")
    # Optional: defaults to None when the instance has no GPU.
    # NOTE(review): the captured model output shows the literal string 'None'
    # being returned here instead of a null value.
    gpu: Optional[str] = Field(default=None, description="Name of gpu model and memory (GiB) of instance")
    region: str = Field(description="Region of instance")
    cost_per_hour: float = Field(description="Cost per Hour of instance")
    description: str = Field(description="The detailed process that led to the selection of the minimum specification instance.")
    
# Top-level result of the repository analysis: one Instance per cloud provider
# plus a per-language byte count for the repository.
class RepoResult(BaseModel):
    gcp: Instance = Field(description="Instance information of GCP")
    aws: Instance = Field(description="Instance information of AWS")
    azure: Instance = Field(description="Instance information of Azure")
    # Maps programming language name -> number of bytes (per the description).
    language_ratio: Dict[str, int] = Field(description="The key value is the programming language used and the value is the number of bytes the programming language is used in the entire repository.")

# Parser that turns the model's reply into a RepoResult object.
# Alternative kept for reference (not active):
# output_parser = JsonOutputParser(pydantic_object=RepoResult)
output_parser = PydanticOutputParser(pydantic_object=RepoResult)

In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used as the `llm` stage of every chain in this notebook.
llm = ChatGoogleGenerativeAI(
    model="models/gemini-1.5-pro-latest",
    streaming=True,
)

I0000 00:00:1722680190.988095   21583 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1722680190.989008   21583 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [7]:
from langchain_community.document_loaders import GitLoader

# Load the repository's files as documents through GitLoader, using
# ./repo/todo as the local checkout path and the `main` branch.
repo_path = "./repo/todo"
loader = GitLoader(
    clone_url="https://github.com/rjwharry/todo.git",
    repo_path=repo_path,
    branch="main",
)
data = loader.load()

# Keep only documents whose source path lives under backend/.
backend = [doc for doc in data if doc.metadata["source"].startswith("backend/")]

# Notebook display: number of backend documents.
len(backend)

I0000 00:00:1722680191.936129   21583 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722680191.936322   21583 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1722680191.979675   21583 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722680191.979920   21583 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1722680192.012846   21583 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722680192.013070   21583 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1722680192.045877   21583 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722680192.046120   21583 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1722680192.077191   21583 wo

20

In [8]:
from langchain.prompts import PromptTemplate
# NOTE(review): the two comment lines below look like an unused draft of an
# "Output" section for the prompt; they are not part of the template.
# Output
# Ensure the output is structured in a clear and detailed manner, adhering to the JSON format specified by the following guide: {format_instruction}
# Prompt for the repository analysis. The template text is sent to the model
# verbatim, so any edit to it (including whitespace) changes the request.
prompt = PromptTemplate(
    template="""
As an expert in analyzing software repositories and estimating resource consumption and environmental impact, your task is to provide a comprehensive analysis of a GitHub repository.

Instructions
Analyze the structure of the files in the provided GitHub repository by examining the metadata in the documentation.
Identify the entry point of the repository.
Determine the minimum resources required to run the repository on GCP, AWS, and Azure platforms.
Estimate the power consumption, and carbon footprint on each platform.
{format_instruction}
Context
The goal is to gain a detailed understanding of the repository’s requirements and its environmental impact. Your analysis should be thorough, taking into account all relevant aspects of the repository and the different cloud platforms.


GitHub Repository
{GITHUB}

Additional Guidelines
Be specific and detailed in your analysis.
Provide calculations and assumptions used in estimating resources and environmental impact.
Compare and contrast the findings across the three cloud platforms (GCP, AWS, Azure).
Use technical terminology appropriately to convey precision and expertise.
""",
    # {GITHUB} is supplied by the caller at invoke time.
    input_variables=["GITHUB"],
    # {format_instruction} is pre-filled from the current output_parser.
    partial_variables={"format_instruction": output_parser.get_format_instructions()}
    )
# Pipeline: render the prompt, call the model, parse the reply.
chain = prompt | llm | output_parser

In [9]:
# Run the analysis chain with the backend documents bound to {GITHUB}; the
# value returned is whatever the chain's final output_parser stage produces.
gemini_result = chain.invoke({"GITHUB": backend})

I0000 00:00:1722680214.542598   53315 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722680214.542884   53315 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


In [18]:
# Keep the parsed analysis under the name used by the rest of the notebook,
# print it, and display the AWS entry as the cell's output.
repo_result = gemini_result
print(repo_result)
repo_result.aws

gcp=Instance(cloud_provider='GCP', name='e2-medium', cpu=2, ram=4.0, storage=10, gpu='None', region='us-central1', cost_per_hour=0.0516, description="This application requires a minimum of 2 vCPUs and 2GB of RAM to run. A simple Java application with Spring Boot and MySQL usually doesn't demand high CPU or memory resources. Therefore, an e2-medium instance, which provides a balance of performance and cost, is chosen. 10GB of storage is sufficient for this application and its data. The application does not require a GPU. The us-central1 region offers a good balance of cost and latency for many users.") aws=Instance(cloud_provider='AWS', name='t3.medium', cpu=2, ram=4.0, storage=10, gpu='None', region='us-east-1', cost_per_hour=0.0468, description="This application requires a minimum of 2 vCPUs and 2GB of RAM to run. A simple Java application with Spring Boot and MySQL usually doesn't demand high CPU or memory resources. Therefore, a t3.medium instance, which provides a balance of perfor

Instance(cloud_provider='AWS', name='t3.medium', cpu=2, ram=4.0, storage=10, gpu='None', region='us-east-1', cost_per_hour=0.0468, description="This application requires a minimum of 2 vCPUs and 2GB of RAM to run. A simple Java application with Spring Boot and MySQL usually doesn't demand high CPU or memory resources. Therefore, a t3.medium instance, which provides a balance of performance and cost, is chosen. 10GB of storage is sufficient for this application and its data. The application does not require a GPU. The us-east-1 region offers a good balance of cost and latency for many users.")

# Get Best Instance
- gemini에서 추론한 instance spec을 기반으로 최신 데이터와 대조하여 가장 싼 가격의 instance 추출

In [31]:
from math import inf
from typing import Dict
from google.cloud.firestore_v1.base_query import FieldFilter, Or, And
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

def get_latest_price(instance: Instance) -> Instance:
    """Look up the cheapest catalogued instance comparable to ``instance``.

    Queries the Firestore ``cloud_cost`` collection for documents whose
    ``vendor`` equals ``instance.cloud_provider`` and that either share the
    instance's ``name`` or match both its ``cpu`` and ``ram``, then selects the
    document with the lowest ``cost_per_hour``.

    Args:
        instance: The instance whose pricing should be refreshed.

    Returns:
        A new ``Instance`` whose name, cpu, ram, gpu, region and cost_per_hour
        come from the cheapest matching document, while cloud_provider,
        storage and description are carried over from ``instance``. When no
        document matches the query, ``instance`` is returned unchanged.
    """
    # Initialise the default Firebase app only once. get_app() raises
    # ValueError while no default app exists, which avoids reaching into the
    # private firebase_admin._apps registry.
    try:
        firebase_admin.get_app()
    except ValueError:
        cred = credentials.Certificate('firebase-svc-account-key.json')
        firebase_admin.initialize_app(cred)

    db = firestore.client()
    ref = db.collection("cloud_cost")

    # vendor == provider AND (name == name OR (cpu == cpu AND ram == ram))
    vendor_filter = FieldFilter("vendor", "==", instance.cloud_provider)
    name_filter = FieldFilter("name", "==", instance.name)
    cpu_filter = FieldFilter("cpu", "==", instance.cpu)
    ram_filter = FieldFilter("ram", "==", instance.ram)
    resource_filter = And(filters=[cpu_filter, ram_filter])
    instance_filter = Or(filters=[name_filter, resource_filter])
    final_filter = And(filters=[vendor_filter, instance_filter])
    docs = ref.where(filter=final_filter).stream()

    # Convert each snapshot exactly once. min() returns the first of several
    # equally cheap candidates, matching a strict "lower than" scan.
    candidates = (doc.to_dict() for doc in docs)
    cheapest = min(
        candidates,
        key=lambda spec: spec["cost_per_hour"],
        default=None,
    )
    if cheapest is None:
        # Nothing in the catalogue matched: keep the proposed instance instead
        # of failing with a KeyError on a placeholder record.
        return instance

    return Instance(
        cloud_provider=instance.cloud_provider,
        name=cheapest["name"],
        cpu=cheapest["cpu"],
        ram=cheapest["ram"],
        storage=instance.storage,
        # gpu is optional on Instance, so tolerate documents without the field.
        gpu=cheapest.get("gpu"),
        region=cheapest["region"],
        cost_per_hour=cheapest["cost_per_hour"],
        description=instance.description,
    )

In [32]:
# Refresh the AWS and GCP proposals against the price catalogue.
# NOTE(review): repo_result.azure is not refreshed here -- confirm whether
# that is intentional (e.g. no Azure data in the catalogue).
best_aws_instance = get_latest_price(repo_result.aws)
best_gcp_instance = get_latest_price(repo_result.gcp)

In [33]:
# Overwrite the AWS and GCP entries with their refreshed counterparts.
repo_result.aws, repo_result.gcp = best_aws_instance, best_gcp_instance

In [35]:
# Display repo_result after its AWS and GCP entries were reassigned above.
repo_result

RepoResult(gcp=Instance(cloud_provider='GCP', name='e2-medium', cpu=1, ram=4.0, storage=10, gpu=None, region='us-central1', cost_per_hour=0.0169861111111111, description="This application requires a minimum of 2 vCPUs and 2GB of RAM to run. A simple Java application with Spring Boot and MySQL usually doesn't demand high CPU or memory resources. Therefore, an e2-medium instance, which provides a balance of performance and cost, is chosen. 10GB of storage is sufficient for this application and its data. The application does not require a GPU. The us-central1 region offers a good balance of cost and latency for many users."), aws=Instance(cloud_provider='AWS', name='t3.medium', cpu=2, ram=4.0, storage=10, gpu=None, region='us-east-1', cost_per_hour=0.0441, description="This application requires a minimum of 2 vCPUs and 2GB of RAM to run. A simple Java application with Spring Boot and MySQL usually doesn't demand high CPU or memory resources. Therefore, a t3.medium instance, which provides

# Calculate estimated power consumption and carbon footprint
- RepoResult를 기반으로 각 instance의 power consumption과 carbon footprint 계산

In [37]:
from typing import Dict
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Hourly power and carbon estimate for one instance. Both figures are typed as
# str; the captured output shows values with units, e.g. '0.05 kWh'.
# NOTE(review): documented with `#` comments rather than a docstring on
# purpose -- a class docstring would presumably be emitted into the generated
# schema and change the format instructions; confirm before adding one.
class Estimate(BaseModel):
    power_consumption: str = Field(description="Estimated hourly power consumption while running an instance of the instance_type(kWh)")
    carbon_footprint: str = Field(description="Estimated hourly carbon footprint while running an instance of the instance_type(kg CO2)")
    description: str = Field(description="Detailed calculation process for estimating power consumption and carbon emissions.")
    
# One Estimate per cloud provider; this is the structure the estimation chain
# is asked to return.
class CalculateResult(BaseModel):
    gcp: Estimate = Field(description="Estimate result of GCP")
    aws: Estimate = Field(description="Estimate result of AWS")
    azure: Estimate = Field(description="Estimate result of Azure")

# Rebinds the shared `output_parser` name: from here on it parses replies into
# CalculateResult instead of the model used by earlier cells.
output_parser = PydanticOutputParser(pydantic_object=CalculateResult)

In [38]:
from langchain.prompts import PromptTemplate
# NOTE(review): the two comment lines below look like an unused draft of an
# "Output" section for the prompt; they are not part of the template.
# Output
# Ensure the output is structured in a clear and detailed manner, adhering to the JSON format specified by the following guide: {format_instruction}
# Prompt for estimating hourly power consumption and carbon footprint. The
# template text is sent to the model verbatim, so any edit changes the request.
prompt = PromptTemplate(
    template="""As an expert in cloud computing and sustainability, provide a detailed estimation of the hourly power consumption and carbon footprint for the specified instance specs from the leading cloud providers: AWS, GCP, and Azure.

Instructions:
Analyze the instance specifications provided for each cloud provider.
Calculate the hourly power consumption based on the given specs.
Estimate the carbon footprint associated with the hourly power consumption.
Use the most recent data and metrics available for accurate estimations.
Present the results clearly and concisely.
Instance Specifications:
AWS: {aws}

GCP: {gcp}

Azure: {azure}

Desired Format:
{format_instruction}
Ensure that your calculations are accurate and well-documented. Provide references to any data sources or formulas used in the estimation process.
""",
    # {aws}, {gcp} and {azure} are supplied by the caller at invoke time.
    input_variables=["aws", "gcp", "azure"],
    # {format_instruction} is pre-filled from the current output_parser.
    partial_variables={"format_instruction": output_parser.get_format_instructions()}
    )
# Rebinds `chain`: render this prompt, call the model, parse the reply.
chain = prompt | llm | output_parser

In [39]:
# Feed the string form of each provider's instance into the estimation chain,
# keyed by provider name in the order aws, gcp, azure.
calculate_result = chain.invoke(
    {
        provider: str(getattr(repo_result, provider))
        for provider in ("aws", "gcp", "azure")
    }
)

I0000 00:00:1722680964.808248   53325 subchannel.cc:806] subchannel 0x7ff40b3b6d50 {address=ipv6:%5B2404:6800:400a:804::200a%5D:443, args={grpc.client_channel_factory=0x7ff411313160, grpc.default_authority=generativelanguage.googleapis.com:443, grpc.dns_enable_srv_queries=1, grpc.http2_scheme=https, grpc.internal.channel_credentials=0x7ff40b61c4c0, grpc.internal.client_channel_call_destination=0x116b9a218, grpc.internal.event_engine=0x7ff40b3a7a00, grpc.internal.security_connector=0x7ff40b3a1c90, grpc.internal.subchannel_pool=0x7ff411319170, grpc.max_receive_message_length=-1, grpc.max_send_message_length=-1, grpc.primary_user_agent=grpc-python/1.65.1, grpc.resource_quota=0x7ff411314900, grpc.server_uri=dns:///generativelanguage.googleapis.com:443}}: connect failed (UNKNOWN:connect: No route to host (65) {created_time:"2024-08-03T19:29:24.808069+09:00"}), backing off for 999 ms
I0000 00:00:1722680965.061562   53326 subchannel.cc:806] subchannel 0x7ff40b3b7590 {address=ipv6:%5B2404:6800

In [40]:
# Print the parsed per-provider estimates for inspection.
print(calculate_result)

gcp=Estimate(power_consumption='0.05 kWh', carbon_footprint='0.01 kg CO2', description="Based on the GCP Carbon Footprint calculator and considering the e2-medium instance located in us-central1 region has a sustained CPU utilization of 40%, the estimated hourly power consumption is 0.05 kWh. This calculation considers the PUE of Google Cloud's data centers. The carbon footprint is estimated to be 0.01 kg CO2 per hour, based on the region's grid carbon intensity and Google's commitment to renewable energy.") aws=Estimate(power_consumption='0.06 kWh', carbon_footprint='0.025 kg CO2', description="Based on the AWS power consumption data for t3.medium instance and considering a sustained CPU utilization of 40%, the estimated hourly power consumption is 0.06 kWh. This calculation considers the PUE of AWS data centers. The carbon footprint is estimated to be 0.025 kg CO2 per hour, based on the region's grid carbon intensity and AWS's sustainability initiatives.") azure=Estimate(power_consum

# Select the best instance
- instance, language_ratio, estimation 정보를 다 모아서 마지막으로 가장 좋은 instance 선택

In [41]:
from langchain_core.output_parsers import PydanticOutputParser
# Pairs one provider's Instance with its Estimate.
# NOTE(review): documented with `#` comments rather than a docstring on
# purpose -- a class docstring would presumably be emitted into the generated
# schema and change the format instructions; confirm before adding one.
class InstanceResult(BaseModel):
    instance: Instance = Field(description="Instance information")
    estimate: Estimate = Field(description="Estimate result of instance")

# Wrapper holding the single InstanceResult chosen as the best option; this is
# the structure the selection chain is asked to return.
class BestInstance(BaseModel):
    conclusion: InstanceResult = Field(description="The most appropriate among gcp, aws, and azure")

# Rebinds the shared `output_parser` name: from here on it parses replies into
# BestInstance instead of the model used by earlier cells.
output_parser = PydanticOutputParser(pydantic_object=BestInstance)

In [42]:
from langchain.prompts import PromptTemplate
# NOTE(review): the two comment lines below look like an unused draft of an
# "Output" section for the prompt; they are not part of the template.
# Output
# Ensure the output is structured in a clear and detailed manner, adhering to the JSON format specified by the following guide: {format_instruction}
# Prompt for choosing the best instance across providers. The template text is
# sent to the model verbatim, so any edit changes the request.
prompt = PromptTemplate(
    template="""As an expert in cloud computing economics and environmental sustainability, identify the most economical and environmentally friendly instance among the provided options from AWS, GCP, and Azure.

Instructions:
Analyze the instance specifications and estimation results provided for each cloud provider.
Compare the hourly cost and carbon footprint of each instance.
Determine which instance offers the best balance of cost efficiency and low environmental impact.
Clearly explain the reasoning behind your choice, supported by data.

Instance Specifications and Estimation Results:

AWS: {aws}

GCP: {gcp}

Azure: {azure}

Desired Format:
{format_instruction}

Ensure that your analysis is thorough and well-documented. Provide references to any data sources or formulas used in the estimation process.
""",
    # {aws}, {gcp} and {azure} are supplied by the caller at invoke time.
    input_variables=["aws", "gcp", "azure"],
    # {format_instruction} is pre-filled from the current output_parser.
    partial_variables={"format_instruction": output_parser.get_format_instructions()}
    )
# Rebinds `chain`: render this prompt, call the model, parse the reply.
chain = prompt | llm | output_parser

In [43]:
# Bundle each provider's instance with its estimate, in the order
# aws, gcp, azure.
aws, gcp, azure = (
    InstanceResult(
        instance=getattr(repo_result, provider),
        estimate=getattr(calculate_result, provider),
    )
    for provider in ("aws", "gcp", "azure")
)

In [44]:
# Ask the selection chain to choose among the three providers, passing each
# bundled result in its string form.
best_instance_result = chain.invoke(
    dict(
        aws=str(aws),
        gcp=str(gcp),
        azure=str(azure),
    )
)

In [48]:
# Alias the parsed result of the selection chain. The previous right-hand side
# named a variable that is never assigned anywhere in this notebook, so a
# top-to-bottom run stopped here with a NameError.
best_instance = best_instance_result

In [49]:
# Aggregated output of the whole pipeline: the per-provider results, the
# chosen best option, and the repository's per-language byte counts.
class FinalResponse(BaseModel):
    aws: InstanceResult = Field(description="Information of instance and estimation")
    gcp: InstanceResult = Field(description="Information of instance and estimation")
    azure: InstanceResult = Field(description="Information of instance and estimation")
    conclusion: InstanceResult = Field(description="The most appropriate among gcp, aws, and azure")
    # Maps programming language name -> number of bytes (per the description).
    language_ratio: Dict[str, int] = Field(description="The key value is the programming language used and the value is the number of bytes the programming language is used in the entire repository.")

In [50]:
# Assemble the final payload from the per-provider bundles, the selected
# conclusion, and the language breakdown from the repository analysis.
final_response = FinalResponse(
    aws=aws, 
    gcp=gcp, 
    azure=azure, 
    conclusion=best_instance.conclusion, 
    language_ratio=repo_result.language_ratio
)
# Notebook display of the assembled response.
final_response

FinalResponse(aws=InstanceResult(instance=Instance(cloud_provider='AWS', name='t3.medium', cpu=2, ram=4.0, storage=10, gpu=None, region='us-east-1', cost_per_hour=0.0441, description="This application requires a minimum of 2 vCPUs and 2GB of RAM to run. A simple Java application with Spring Boot and MySQL usually doesn't demand high CPU or memory resources. Therefore, a t3.medium instance, which provides a balance of performance and cost, is chosen. 10GB of storage is sufficient for this application and its data. The application does not require a GPU. The us-east-1 region offers a good balance of cost and latency for many users."), estimate=Estimate(power_consumption='0.06 kWh', carbon_footprint='0.025 kg CO2', description="Based on the AWS power consumption data for t3.medium instance and considering a sustained CPU utilization of 40%, the estimated hourly power consumption is 0.06 kWh. This calculation considers the PUE of AWS data centers. The carbon footprint is estimated to be 0.