In [1]:
from langchain_community.document_loaders import GitLoader

repo_path = "./repo/todo"
loader = GitLoader(
    # clone_url="https://github.com/dudaji/git-inspector.git",
    clone_url="https://github.com/rjwharry/todo.git",
    repo_path=repo_path,
    branch="main",
)

In [2]:
data = loader.load()
len(data)

56

In [3]:
backend = list(filter(lambda d: d.metadata["source"].startswith("backend/"), data))

In [4]:
print(len(backend))

20


In [10]:
backend[-1]

Document(metadata={'source': 'backend/kotlin/src/main/kotlin/com/practice/kotlin/services/TodoService.kt', 'file_path': 'backend/kotlin/src/main/kotlin/com/practice/kotlin/services/TodoService.kt', 'file_name': 'TodoService.kt', 'file_type': '.kt'}, page_content='package com.practice.kotlin.services\n\nimport com.practice.kotlin.dto.TodoDto\nimport com.practice.kotlin.repositories.STATUS\nimport com.practice.kotlin.repositories.Todo\nimport com.practice.kotlin.repositories.TodoRepository\nimport org.springframework.stereotype.Service\n\n@Service\nclass TodoService(private val todoRepository: TodoRepository) {\n\tfun getAllTodos(): List<Todo> {\n\t\treturn todoRepository.findAll()\n\t}\n\n\tfun getTodoById(id: Long): Todo {\n\t\treturn todoRepository.findById(id).get()\n\t}\n\n\n\tfun createTodo(todo: Todo): Todo {\n\t\tval lastTodo = getLastTodoByStatus(todo.status)\n\t\tval newTodo = todoRepository.save(todo)\n\t\tnewTodo.prev = lastTodo?.id\n\t\tnewTodo.next = null\n\t\ttodoRepositor

In [5]:
import os
from dotenv import load_dotenv

load_dotenv()
os.environ["LANGCHAIN_PROJECT"] = "Git Analyzer"
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-pro-latest", streaming=True)

  from .autonotebook import tqdm as notebook_tqdm
I0000 00:00:1722603488.689705  360924 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
I0000 00:00:1722603488.700513  360924 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1722603488.702067  360924 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [10]:
from typing import Dict
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

class Cost(BaseModel):
    # monthly: str = Field(description="Estimated monthly measurements")
    hourly: str = Field(description="Estimated measurements per hour")

class Instance(BaseModel):
    cloud_provider: str = Field(description="Name of cloud provider (GCP or AWS or Azure)")
    name: str = Field(description="Name of instance type")
    cpu: int = Field(description="The number of instance cpu cores")
    ram: float = Field(description="The capacity of instance ram (GiB)")
    storage: int = Field(description="The capacity of instance storage (GiB)")
    gpu: str = Field(description="Name of gpu model and memory (GiB) of instance")
    region: str = Field(description="Region of instance")
    cost: Cost = Field(description="Monthly and Hourly cost of instance")

class Estimate(BaseModel):
    instance: Instance = Field(description="Instance information of Cloud Provider")
    # pricing: Cost = Field(description="Estimated pricing while running an instance of the instance_type (USD)")
    power_consumption: Cost = Field(description="Estimated power consumption while running an instance of the instance_type(kWh)")
    carbon_footprint: Cost = Field(description="Estimated carbon footprint while running an instance of the instance_type(kg CO2)")
    description: str = Field(description="A rationale and detailed explanation for estimations")
    

class Result(BaseModel):
    gcp: Estimate = Field(description="Estimated Result of Google Cloud Platform(GCP)")
    aws: Estimate = Field(description="Estimated Result of Amazon Web Services(AWS)")
    azure: Estimate = Field(description="Estimated Result of Microsoft Azure")
    # conclusion: Estimate = Field(description="The most appropriate among gcp, aws, and azure")
    language_ratio: Dict[str, int] = Field(description="The key value is the programming language used and the value is the number of bytes in which the programming language is used.")

output_parser = JsonOutputParser(pydantic_object=Result)
# output_parser = JsonOutputParser()

In [8]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
# Output
# Ensure the output is structured in a clear and detailed manner, adhering to the JSON format specified by the following guide: {format_instruction}
prompt = PromptTemplate(
    template="""
As an expert in analyzing software repositories and estimating resource consumption and environmental impact, your task is to provide a comprehensive analysis of a GitHub repository.

Instructions
Analyze the structure of the files in the provided GitHub repository by examining the metadata in the documentation.
Identify the entry point of the repository.
Determine the minimum resources required to run the repository on GCP, AWS, and Azure platforms.
Estimate the power consumption, and carbon footprint on each platform.
{format_instruction}
Context
The goal is to gain a detailed understanding of the repository’s requirements and its environmental impact. Your analysis should be thorough, taking into account all relevant aspects of the repository and the different cloud platforms.


GitHub Repository
{GITHUB}

Additional Guidelines
Be specific and detailed in your analysis.
Provide calculations and assumptions used in estimating resources and environmental impact.
Compare and contrast the findings across the three cloud platforms (GCP, AWS, Azure).
Use technical terminology appropriately to convey precision and expertise.
""",
    input_variables=["GITHUB"],
    partial_variables={"format_instruction": output_parser.get_format_instructions()}
    )
chain = prompt | llm | output_parser
# chain = prompt | llm | StrOutputParser()

In [9]:
# for chunk in chain.stream({"GITHUB": backend}):
#     print(chunk, end="", flush=True)
answer = chain.invoke({"GITHUB": backend})
answer

I0000 00:00:1722603507.831725  363223 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722603507.832005  363223 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


{'gcp': {'instance': {'cloud_provider': 'GCP',
   'name': 'e2-micro',
   'cpu': '2',
   'ram': 0.6,
   'storage': 30,
   'gpu': 'None',
   'region': 'us-central1',
   'cost': {'monthly': '$13.04', 'hourly': '$0.006'}},
  'power_consumption': {'monthly': '9.78 kWh', 'hourly': '0.004 kWh'},
  'carbon_footprint': {'monthly': '4.401 kg CO2', 'hourly': '0.002 kg CO2'},
  'description': "This Kotlin Spring Boot application, connecting to a MySQL database, can be run on a minimal GCP e2-micro instance. The e2-micro, with 2 vCPUs and 0.6 GB memory, suffices for development and light traffic. We estimate 30GB storage for the application and database. The estimated cost is based on GCP's pricing, and power consumption is a conservative estimate. The carbon footprint is derived using US-central1's carbon efficiency.  "},
 'aws': {'instance': {'cloud_provider': 'AWS',
   'name': 't4g.nano',
   'cpu': '2',
   'ram': 0.5,
   'storage': 20,
   'gpu': 'None',
   'region': 'us-east-1',
   'cost': {'mon